
/*---------------------------------------------------------------*/
/*--- begin                               guest_x86_helpers.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_emnote.h"
#include "libvex_guest_x86.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_x86_defs.h"
#include "guest_generic_x87.h"


/* This file contains helper functions for x86 guest code.
   Calls to these functions are generated by the back end.
   These calls are of course in the host machine code and 
   this file will be compiled to host machine code, so that
   all makes sense.  

   Only change the signatures of these helper functions very
   carefully.  If you change the signature here, you'll have to change
   the parameters passed to it in the IR calls constructed by
   guest-x86/toIR.c.

   The convention used is that all functions called from generated
   code are named x86g_<something>, and any function whose name lacks
   that prefix is not called from generated code.  Note that some
   LibVEX_* functions can however be called by VEX's client, but that
   is not the same as calling them from VEX-generated code.
*/


/* Set to 1 to get detailed profiling info about use of the flag
   machinery. */
#define PROFILE_EFLAGS 0


/*---------------------------------------------------------------*/
/*--- %eflags run-time helpers.                               ---*/
/*---------------------------------------------------------------*/

/* Parity-flag lookup, indexed by a byte value.  Entry i holds
   X86G_CC_MASK_P when i has an even number of 1 bits (index 0 -> P,
   1 -> 0, 2 -> 0, 3 -> P, ...) and 0 otherwise, so the ACTIONS_*
   macros can obtain the P flag with a single
   parity_table[(UChar)value] lookup. */
static const UChar parity_table[256] = {
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
    0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
};

/* Bidirectional shift: a non-negative count n shifts x left by n
   bits (performed on the unsigned representation), a negative count
   shifts x right by -n bits (performed on the signed value).  The
   flag macros use it to move an operand's top bit onto a fixed flag
   position regardless of operand width. */
inline static Int lshift ( Int x, Int n )
{
   if (n < 0) {
      /* Negative count: shift right by the magnitude. */
      return x >> (-n);
   }
   return (Int)((UInt)x << n);
}

/* Returns its ULong argument unchanged.  It is passed as the
   double-width "narrowing" function for the 32-bit multiply cases,
   where the double-width value needs no narrowing. */
static inline ULong idULong ( ULong x )
{
   const ULong unchanged = x;
   return unchanged;
}


/* Common prologue for the ACTIONS_* flag macros.  Declares, in the
   current scope:
     DATA_MASK  all-ones mask for an operand of DATA_BITS bits
                (8 -> 0xFF, 16 -> 0xFFFF, anything else -> 0xFFFFFFFF)
     SIGN_MASK  the single top bit of such an operand
     CC_DEP1, CC_DEP2, CC_NDEP
                copies of cc_dep1_formal, cc_dep2_formal and
                cc_ndep_formal, which must be in scope at the point
                of use.
   The argument is parenthesised so that an expression may be passed,
   and the possibly-unused locals are marked used with void casts
   rather than self-assignments (which some compilers warn about). */
#define PREAMBLE(DATA_BITS)					\
   /* const */ UInt DATA_MASK 					\
      = (DATA_BITS)==8 ? 0xFF 					\
                       : ((DATA_BITS)==16 ? 0xFFFF 		\
                                          : 0xFFFFFFFF); 	\
   /* const */ UInt SIGN_MASK = 1u << ((DATA_BITS) - 1);	\
   /* const */ UInt CC_DEP1 = cc_dep1_formal;			\
   /* const */ UInt CC_DEP2 = cc_dep2_formal;			\
   /* const */ UInt CC_NDEP = cc_ndep_formal;			\
   /* Not every ACTIONS_* macro reads all of these; the   */	\
   /* void casts stop the compiler complaining about      */	\
   /* unused variables.                                   */	\
   (void)SIGN_MASK;						\
   (void)DATA_MASK;						\
   (void)CC_DEP2;						\
   (void)CC_NDEP;


/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide addition; DATA_UTYPE is the
   unsigned type of that width.  Expands to a braced block ending in
   `return`, so it must be used inside a function that has the
   cc_*_formal names read by PREAMBLE in scope.
   CC_DEP1 and CC_DEP2 are taken as the two addends.
     cf: 1 when the truncated sum is below the truncated left addend
     pf: parity_table lookup on the low byte of the sum
     af: bit 4 of (res ^ argL ^ argR)
     zf: bit 6, set when the truncated sum is zero
     sf: the sum's top bit (bit DATA_BITS-1) moved to bit 7
     of: set when the addends agree in their top bit but the sum's
         top bit differs from argL's; that bit is moved to bit 11
         and masked with X86G_CC_MASK_O */
#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     UInt argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL + argR;					\
     cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                 12 - DATA_BITS) & X86G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide subtraction argL - argR;
   DATA_UTYPE is the unsigned type of that width.  Expands to a
   braced block ending in `return`.
   CC_DEP1 is the left operand, CC_DEP2 the right operand.
     cf: 1 when truncated argL is below truncated argR (a borrow)
     pf: parity_table lookup on the low byte of the difference
     af: bit 4 of (res ^ argL ^ argR)
     zf: bit 6, set when the truncated difference is zero
     sf: the difference's top bit moved to bit 7
     of: set when the operands differ in their top bit and the
         result's top bit differs from argL's; moved to bit 11 and
         masked with X86G_CC_MASK_O */
#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     UInt argL, argR, res;					\
     argL = CC_DEP1;						\
     argR = CC_DEP2;						\
     res  = argL - argR;					\
     cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;			\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res),	 		\
                 12 - DATA_BITS) & X86G_CC_MASK_O; 		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide add-with-carry.  Expands to
   a braced block ending in `return`.
   oldC is the carry-in, taken from the C bit of CC_NDEP.  CC_DEP1 is
   the left operand.  The right operand is recovered as
   CC_DEP2 ^ oldC -- presumably the thunk stores it XORed with the
   carry-in; confirm against the IR generator.
     cf: with a carry-in, 1 when truncated res <= truncated argL;
         without one, 1 when truncated res < truncated argL
     pf, af, zf, sf: derived from res exactly as for a plain add
     of: same-top-bit operands with a result whose top bit differs
         from argL's; moved to bit 11 and masked with X86G_CC_MASK_O */
#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     UInt argL, argR, oldC, res;		       		\
     oldC = CC_NDEP & X86G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL + argR) + oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL;		\
     else							\
        cf = (DATA_UTYPE)res < (DATA_UTYPE)argL;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR ^ -1) & (argL ^ res), 		\
                  12 - DATA_BITS) & X86G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide subtract-with-borrow.
   Expands to a braced block ending in `return`.
   oldC is the borrow-in, taken from the C bit of CC_NDEP.  CC_DEP1
   is the left operand.  The right operand is recovered as
   CC_DEP2 ^ oldC -- presumably the thunk stores it XORed with the
   carry-in; confirm against the IR generator.
     cf: with a borrow-in, 1 when truncated argL <= truncated argR;
         without one, 1 when truncated argL < truncated argR
     pf, af, zf, sf: derived from res exactly as for a plain subtract
     of: operands differing in their top bit with a result whose top
         bit differs from argL's; moved to bit 11 and masked with
         X86G_CC_MASK_O */
#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     UInt argL, argR, oldC, res;		       		\
     oldC = CC_NDEP & X86G_CC_MASK_C;				\
     argL = CC_DEP1;						\
     argR = CC_DEP2 ^ oldC;	       				\
     res  = (argL - argR) - oldC;				\
     if (oldC)							\
        cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR;		\
     else							\
        cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR;		\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = lshift((argL ^ argR) & (argL ^ res), 			\
                 12 - DATA_BITS) & X86G_CC_MASK_O;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide logical operation.  Expands
   to a braced block ending in `return`.
   Only CC_DEP1 is read; it is treated as the operation's result.
   cf, af and of are always 0; pf, zf and sf are derived from the
   result (parity of its low byte, zero test on the truncated value,
   and its top bit moved to bit 7). */
#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     cf = 0;							\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0;							\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     of = 0;							\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide increment.  Expands to a
   braced block ending in `return`.
   CC_DEP1 is the result; the operands are reconstructed as
   argL = res - 1 and argR = 1.
     cf: not recomputed -- copied from the C bit of CC_NDEP
     pf, af, zf, sf: derived from res/argL/argR as for an add
     of: bit 11, set when the result masked to DATA_BITS equals
         SIGN_MASK (only the top bit set) */
#define ACTIONS_INC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     UInt argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res - 1;						\
     argR = 1;							\
     cf = CC_NDEP & X86G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) == SIGN_MASK) << 11;		\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide decrement.  Expands to a
   braced block ending in `return`.
   CC_DEP1 is the result; the operands are reconstructed as
   argL = res + 1 and argR = 1.
     cf: not recomputed -- copied from the C bit of CC_NDEP
     pf, af, zf, sf: derived from res/argL/argR as for a subtract
     of: bit 11, set when the result masked to DATA_BITS equals
         SIGN_MASK - 1 (every bit below the top bit set) */
#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     UInt argL, argR, res;					\
     res  = CC_DEP1;						\
     argL = res + 1;						\
     argR = 1;							\
     cf = CC_NDEP & X86G_CC_MASK_C;				\
     pf = parity_table[(UChar)res];				\
     af = (res ^ argL ^ argR) & 0x10;				\
     zf = ((DATA_UTYPE)res == 0) << 6;				\
     sf = lshift(res, 8 - DATA_BITS) & 0x80;			\
     of = ((res & DATA_MASK) 					\
          == ((UInt)SIGN_MASK - 1)) << 11;			\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide left shift.  Expands to a
   braced block ending in `return`.
   CC_DEP1 is treated as the result.  CC_DEP2 supplies the carry:
   its bit DATA_BITS-1 becomes cf (presumably CC_DEP2 is the value
   shifted by one place fewer -- confirm against the IR generator).
     pf, zf, sf: derived from CC_DEP1
     af: forced to 0 (architecturally undefined)
     of: top bit of (CC_DEP2 ^ CC_DEP1) moved to bit 11; only
         meaningful for a shift count of 1 */
#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt cf, pf, af, zf, sf, of;				\
     cf = (CC_DEP2 >> (DATA_BITS - 1)) & X86G_CC_MASK_C;	\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) 		\
          & X86G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a DATA_BITS-wide right shift.  Expands to a
   braced block ending in `return`.
   CC_DEP1 is treated as the result.  CC_DEP2 supplies the carry:
   its bit 0 becomes cf (presumably CC_DEP2 is the value shifted by
   one place fewer -- confirm against the IR generator).
     pf, zf, sf: derived from CC_DEP1
     af: forced to 0 (architecturally undefined)
     of: top bit of (CC_DEP2 ^ CC_DEP1) moved to bit 11; only
         meaningful for a shift count of 1 */
#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);  					\
   { UInt cf, pf, af, zf, sf, of;				\
     cf = CC_DEP2 & 1;						\
     pf = parity_table[(UChar)CC_DEP1];				\
     af = 0; /* undefined */					\
     zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6;			\
     sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80;		\
     /* of is defined if shift count == 1 */			\
     of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS)		\
          & X86G_CC_MASK_O;					\
     return cf | pf | af | zf | sf | of;			\
   }								\
}

/*-------------------------------------------------------------*/

/* ROL: cf' = lsb(result).  of' = msb(result) ^ lsb(result). */
/* DEP1 = result, NDEP = old flags */
/* Expands to a braced block ending in `return`.  Every bit of
   CC_NDEP other than O and C is passed through unchanged.  C is
   taken as CC_DEP1 & X86G_CC_MASK_C.  For O, the result's top bit
   (bit DATA_BITS-1) and its bit 0 are each moved to bit 11, XORed,
   and masked with X86G_CC_MASK_O. */
#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt fl 							\
        = (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C))	\
          | (X86G_CC_MASK_C & CC_DEP1)				\
          | (X86G_CC_MASK_O & (lshift(CC_DEP1,  		\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11)));			\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

/* ROR: cf' = msb(result).  of' = msb(result) ^ msb-1(result). */
/* DEP1 = result, NDEP = old flags */
/* Expands to a braced block ending in `return`.  Every bit of
   CC_NDEP other than O and C is passed through unchanged.  C is the
   result's top bit (bit DATA_BITS-1) shifted down and masked with
   X86G_CC_MASK_C.  For O, the top bit and the bit just below it are
   each moved to bit 11, XORed, and masked with X86G_CC_MASK_O. */
#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE)			\
{								\
   PREAMBLE(DATA_BITS);						\
   { UInt fl 							\
        = (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C))	\
          | (X86G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1)))	\
          | (X86G_CC_MASK_O & (lshift(CC_DEP1, 			\
                                      11-(DATA_BITS-1)) 	\
                     ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1)));	\
     return fl;							\
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for an unsigned DATA_BITS x DATA_BITS multiply.
   Expands to a braced block ending in `return`.
     DATA_UTYPE   unsigned type of the operand width
     NARROWtoU    function narrowing a value to DATA_UTYPE
     DATA_U2TYPE  unsigned type of twice the operand width
     NARROWto2U   function narrowing a value to DATA_U2TYPE
   CC_DEP1 and CC_DEP2 are the two operands.
     lo: low half of the product; pf, zf and sf are derived from it
     rr: the full double-width product
     hi: the upper half of rr
     cf: 1 when hi is non-zero; of mirrors cf at bit 11
     af: forced to 0 (architecturally undefined)
   The `lo` product is formed in UInt.  Multiplying two DATA_UTYPE
   values directly would, for a 16-bit DATA_UTYPE, promote both to
   (signed) int and overflow it for large operands, which is
   undefined behaviour; the unsigned product wraps instead and keeps
   the same low bits. */
#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE,  NARROWtoU,         \
                                DATA_U2TYPE, NARROWto2U)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { UInt cf, pf, af, zf, sf, of;                               \
     DATA_UTYPE  hi;                                            \
     DATA_UTYPE  lo                                             \
        = NARROWtoU( ((UInt)((DATA_UTYPE)CC_DEP1))              \
                     * ((UInt)((DATA_UTYPE)CC_DEP2)) );         \
     DATA_U2TYPE rr                                             \
        = NARROWto2U(                                           \
             ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1))               \
             * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) );          \
     hi = NARROWtoU(rr >>/*u*/ DATA_BITS);                      \
     cf = (hi != 0);                                            \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}

/*-------------------------------------------------------------*/

/* Flag computation for a signed DATA_BITS x DATA_BITS multiply.
   Expands to a braced block ending in `return`.
     DATA_STYPE   signed type of the operand width
     NARROWtoS    function narrowing a value for storage in DATA_STYPE
     DATA_S2TYPE  signed type of twice the operand width
     NARROWto2S   function narrowing a value for storage in DATA_S2TYPE
   CC_DEP1 and CC_DEP2 are the two operands; each is first cast to
   DATA_STYPE and then widened to DATA_S2TYPE before multiplying, so
   the product is formed in the double-width signed type.
     lo: low half of the product; pf, zf and sf are derived from it
     rr: the full double-width product
     hi: rr shifted right by DATA_BITS, narrowed
     cf: 1 when hi differs from lo shifted right by DATA_BITS-1,
         i.e. (given an arithmetic right shift) when hi is not simply
         the sign extension of lo; of mirrors cf at bit 11
     af: forced to 0 (architecturally undefined) */
#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE,  NARROWtoS,         \
                                DATA_S2TYPE, NARROWto2S)        \
{                                                               \
   PREAMBLE(DATA_BITS);                                         \
   { UInt cf, pf, af, zf, sf, of;                               \
     DATA_STYPE  hi;                                            \
     DATA_STYPE  lo                                             \
        = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1)         \
                     * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) );    \
     DATA_S2TYPE rr                                             \
        = NARROWto2S(                                           \
             ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1))               \
             * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) );          \
     hi = NARROWtoS(rr >>/*s*/ DATA_BITS);                      \
     cf = (hi != (lo >>/*s*/ (DATA_BITS-1)));                   \
     pf = parity_table[(UChar)lo];                              \
     af = 0; /* undefined */                                    \
     zf = (lo == 0) << 6;                                       \
     sf = lshift(lo, 8 - DATA_BITS) & 0x80;                     \
     of = cf << 11;                                             \
     return cf | pf | af | zf | sf | of;                        \
   }								\
}


#if PROFILE_EFLAGS

/* Profiling state, compiled in only when PROFILE_EFLAGS is non-zero. */

/* True once initCounts() has zeroed the tables below. */
static Bool initted     = False;

/* C flag, fast route */
static UInt tabc_fast[X86G_CC_OP_NUMBER];
/* C flag, slow route */
static UInt tabc_slow[X86G_CC_OP_NUMBER];
/* table for calculate_cond */
static UInt tab_cond[X86G_CC_OP_NUMBER][16];
/* total entry counts for calc_all, calc_c, calc_cond. */
static UInt n_calc_all  = 0;
static UInt n_calc_c    = 0;
static UInt n_calc_cond = 0;

/* True whenever the low 22 bits of the combined call count are all
   zero, i.e. once every 0x400000 helper calls. */
#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))


/* Print the profiling tables: the three call totals, then one row
   per cc_op giving its slow/fast carry-route counts followed by the
   16 per-condition counts.  Counts of 1000 or more are shown in
   thousands with a 'K' suffix; values that read as negative once
   converted to Int are left blank. */
static void showCounts ( void )
{
   Int op, co;

   vex_printf("\nTotal calls: calc_all=%u   calc_cond=%u   calc_c=%u\n",
              n_calc_all, n_calc_cond, n_calc_c);

   vex_printf("      cSLOW  cFAST    O   NO    B   NB    Z   NZ   BE  NBE"
              "    S   NS    P   NP    L   NL   LE  NLE\n");
   vex_printf("     -----------------------------------------------------"
              "----------------------------------------\n");

   for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
      /* Op 0 carries no width tag; after it the ops cycle through
         byte, word and long variants. */
      HChar ch = ' ';
      if (op > 0) {
         switch ((op-1) % 3) {
            case 0:  ch = 'B'; break;
            case 1:  ch = 'W'; break;
            default: ch = 'L'; break;
         }
      }

      vex_printf("%2d%c: ", op, ch);
      vex_printf("%6u ", tabc_slow[op]);
      vex_printf("%6u ", tabc_fast[op]);

      for (co = 0; co < 16; co++) {
         Int n = tab_cond[op][co];
         if (n < 0)
            vex_printf("     ");
         else if (n < 1000)
            vex_printf(" %3d ", n );
         else
            vex_printf(" %3dK", n / 1000);
      }
      vex_printf("\n");
   }
   vex_printf("\n");
}

/* Mark the profiling state as initialised and zero every per-op
   counter: the fast and slow carry tables and all 16 condition
   slots of each op. */
static void initCounts ( void )
{
   Int op;

   initted = True;

   for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
      Int co = 0;
      tabc_slow[op] = 0;
      tabc_fast[op] = 0;
      while (co < 16) {
         tab_cond[op][co] = 0;
         co++;
      }
   }
}

#endif /* PROFILE_EFLAGS */


/* Worker shared by the flag helpers in this file; it is static and
   is not itself called from generated code.
   Calculates all 6 flags (O, S, Z, A, C, P) from the supplied thunk
   parameters and returns them as a flag word.

   cc_op selects how the thunk is interpreted:
     - X86G_CC_OP_COPY: cc_dep1_formal already holds the flags; it is
       returned masked down to the six flag bits.
     - every other recognised op expands one ACTIONS_* macro for the
       matching operand width; each expansion ends in `return`, so
       no case falls through to the next.
     - anything else prints the arguments and calls vpanic.

   The parameter names cc_dep1_formal, cc_dep2_formal and
   cc_ndep_formal are referenced by name from the PREAMBLE macro and
   so must not be renamed. */
static
UInt x86g_calculate_eflags_all_WRK ( UInt cc_op, 
                                     UInt cc_dep1_formal, 
                                     UInt cc_dep2_formal,
                                     UInt cc_ndep_formal )
{
   switch (cc_op) {
      case X86G_CC_OP_COPY:
         return cc_dep1_formal
                & (X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z 
                   | X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P);

      case X86G_CC_OP_ADDB:   ACTIONS_ADD( 8,  UChar  );
      case X86G_CC_OP_ADDW:   ACTIONS_ADD( 16, UShort );
      case X86G_CC_OP_ADDL:   ACTIONS_ADD( 32, UInt   );

      case X86G_CC_OP_ADCB:   ACTIONS_ADC( 8,  UChar  );
      case X86G_CC_OP_ADCW:   ACTIONS_ADC( 16, UShort );
      case X86G_CC_OP_ADCL:   ACTIONS_ADC( 32, UInt   );

      case X86G_CC_OP_SUBB:   ACTIONS_SUB(  8, UChar  );
      case X86G_CC_OP_SUBW:   ACTIONS_SUB( 16, UShort );
      case X86G_CC_OP_SUBL:   ACTIONS_SUB( 32, UInt   );

      case X86G_CC_OP_SBBB:   ACTIONS_SBB(  8, UChar  );
      case X86G_CC_OP_SBBW:   ACTIONS_SBB( 16, UShort );
      case X86G_CC_OP_SBBL:   ACTIONS_SBB( 32, UInt   );

      case X86G_CC_OP_LOGICB: ACTIONS_LOGIC(  8, UChar  );
      case X86G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
      case X86G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt   );

      case X86G_CC_OP_INCB:   ACTIONS_INC(  8, UChar  );
      case X86G_CC_OP_INCW:   ACTIONS_INC( 16, UShort );
      case X86G_CC_OP_INCL:   ACTIONS_INC( 32, UInt   );

      case X86G_CC_OP_DECB:   ACTIONS_DEC(  8, UChar  );
      case X86G_CC_OP_DECW:   ACTIONS_DEC( 16, UShort );
      case X86G_CC_OP_DECL:   ACTIONS_DEC( 32, UInt   );

      case X86G_CC_OP_SHLB:   ACTIONS_SHL(  8, UChar  );
      case X86G_CC_OP_SHLW:   ACTIONS_SHL( 16, UShort );
      case X86G_CC_OP_SHLL:   ACTIONS_SHL( 32, UInt   );

      case X86G_CC_OP_SHRB:   ACTIONS_SHR(  8, UChar  );
      case X86G_CC_OP_SHRW:   ACTIONS_SHR( 16, UShort );
      case X86G_CC_OP_SHRL:   ACTIONS_SHR( 32, UInt   );

      case X86G_CC_OP_ROLB:   ACTIONS_ROL(  8, UChar  );
      case X86G_CC_OP_ROLW:   ACTIONS_ROL( 16, UShort );
      case X86G_CC_OP_ROLL:   ACTIONS_ROL( 32, UInt   );

      case X86G_CC_OP_RORB:   ACTIONS_ROR(  8, UChar  );
      case X86G_CC_OP_RORW:   ACTIONS_ROR( 16, UShort );
      case X86G_CC_OP_RORL:   ACTIONS_ROR( 32, UInt   );

      case X86G_CC_OP_UMULB:  ACTIONS_UMUL(  8, UChar,  toUChar,
                                                UShort, toUShort );
      case X86G_CC_OP_UMULW:  ACTIONS_UMUL( 16, UShort, toUShort,
                                                UInt,   toUInt );
      case X86G_CC_OP_UMULL:  ACTIONS_UMUL( 32, UInt,   toUInt,
                                                ULong,  idULong );

      /* NOTE(review): the signed-multiply rows pass the unsigned
         narrowing helpers; the narrowed value is then stored in a
         signed variable inside ACTIONS_SMUL. */
      case X86G_CC_OP_SMULB:  ACTIONS_SMUL(  8, Char,   toUChar,
                                                Short,  toUShort );
      case X86G_CC_OP_SMULW:  ACTIONS_SMUL( 16, Short,  toUShort, 
                                                Int,    toUInt   );
      case X86G_CC_OP_SMULL:  ACTIONS_SMUL( 32, Int,    toUInt,
                                                Long,   idULong );

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("x86g_calculate_eflags_all_WRK(X86)"
                    "( %u, 0x%x, 0x%x, 0x%x )\n",
                    cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
         vpanic("x86g_calculate_eflags_all_WRK(X86)");
   }
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Public entry point for a full flag evaluation: returns all 6
   flags for the supplied thunk by delegating to
   x86g_calculate_eflags_all_WRK.  When PROFILE_EFLAGS is enabled it
   also bumps the calc_all counter and periodically dumps the
   profile. */
UInt x86g_calculate_eflags_all ( UInt cc_op, 
                                 UInt cc_dep1, 
                                 UInt cc_dep2,
                                 UInt cc_ndep )
{
   UInt all_flags;

#  if PROFILE_EFLAGS
   if (!initted)
      initCounts();
   n_calc_all++;
   if (SHOW_COUNTS_NOW)
      showCounts();
#  endif

   all_flags = x86g_calculate_eflags_all_WRK ( cc_op, cc_dep1,
                                               cc_dep2, cc_ndep );
   return all_flags;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Calculate just the carry flag from the supplied thunk parameters.
   Returns either 0 or X86G_CC_MASK_C.

   A handful of common ops are answered directly (the "fast route"):
     - logic ops always clear carry;
     - subtracts carry when the left operand, masked to the operand
       width, is below the right operand;
     - 32-bit inc/dec leave carry as it was in cc_ndep.
   Everything else takes the "slow route": the full flag word is
   computed by x86g_calculate_eflags_all_WRK and masked down to C.

   In profiling builds the per-op counters are only touched when
   cc_op is a valid table index, so a bogus cc_op reaches the
   worker's panic path instead of writing outside the tables. */
VEX_REGPARM(3)
UInt x86g_calculate_eflags_c ( UInt cc_op, 
                               UInt cc_dep1, 
                               UInt cc_dep2,
                               UInt cc_ndep )
{
#  if PROFILE_EFLAGS
   if (!initted) initCounts();
   n_calc_c++;
   /* Provisionally count this call as fast; corrected below if the
      fast-path switch does not handle cc_op. */
   if (cc_op < (UInt)X86G_CC_OP_NUMBER)
      tabc_fast[cc_op]++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   /* Fast-case some common ones. */
   switch (cc_op) {
      case X86G_CC_OP_LOGICL: 
      case X86G_CC_OP_LOGICW: 
      case X86G_CC_OP_LOGICB:
         return 0;
      case X86G_CC_OP_SUBL:
         return cc_dep1 < cc_dep2
                   ? X86G_CC_MASK_C : 0;
      case X86G_CC_OP_SUBW:
         return (cc_dep1 & 0xFFFF) < (cc_dep2 & 0xFFFF)
                   ? X86G_CC_MASK_C : 0;
      case X86G_CC_OP_SUBB:
         return (cc_dep1 & 0xFF) < (cc_dep2 & 0xFF)
                   ? X86G_CC_MASK_C : 0;
      case X86G_CC_OP_INCL:
      case X86G_CC_OP_DECL:
         return cc_ndep & X86G_CC_MASK_C;
      default: 
         break;
   }

#  if PROFILE_EFLAGS
   /* Not a fast case after all: move the count to the slow table. */
   if (cc_op < (UInt)X86G_CC_OP_NUMBER) {
      tabc_fast[cc_op]--;
      tabc_slow[cc_op]++;
   }
#  endif

   return x86g_calculate_eflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep) 
          & X86G_CC_MASK_C;
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Evaluate the condition code `cond` against the flags described by
   the thunk (cc_op, cc_dep1, cc_dep2, cc_ndep).  Returns 1 or 0.

   The full flag word is computed first.  Conditions come in pairs
   that share one test; `inv` (the low bit of cond) is XORed into
   the tested flag value so that one member of each pair yields the
   opposite answer -- presumably the negated forms are the
   odd-numbered codes; confirm against the X86Condcode definition.
   The flag words are shifted but not masked before combining; the
   final `1 &` keeps only the bit of interest.

   An unrecognised cond prints the arguments and calls vpanic.  In
   profiling builds the tab_cond counter is only updated when both
   indices are within the table, so such a value reaches the panic
   path instead of writing outside the table. */
UInt x86g_calculate_condition ( UInt/*X86Condcode*/ cond, 
                                UInt cc_op, 
                                UInt cc_dep1, 
                                UInt cc_dep2,
                                UInt cc_ndep )
{
   UInt eflags = x86g_calculate_eflags_all_WRK(cc_op, cc_dep1, 
                                               cc_dep2, cc_ndep);
   UInt of,sf,zf,cf,pf;
   UInt inv = cond & 1;

#  if PROFILE_EFLAGS
   if (!initted) initCounts();
   if (cc_op < (UInt)X86G_CC_OP_NUMBER && cond < 16)
      tab_cond[cc_op][cond]++;
   n_calc_cond++;
   if (SHOW_COUNTS_NOW) showCounts();
#  endif

   switch (cond) {
      case X86CondNO:
      case X86CondO: /* OF == 1 */
         of = eflags >> X86G_CC_SHIFT_O;
         return 1 & (inv ^ of);

      case X86CondNZ:
      case X86CondZ: /* ZF == 1 */
         zf = eflags >> X86G_CC_SHIFT_Z;
         return 1 & (inv ^ zf);

      case X86CondNB:
      case X86CondB: /* CF == 1 */
         cf = eflags >> X86G_CC_SHIFT_C;
         return 1 & (inv ^ cf);

      case X86CondNBE:
      case X86CondBE: /* (CF or ZF) == 1 */
         cf = eflags >> X86G_CC_SHIFT_C;
         zf = eflags >> X86G_CC_SHIFT_Z;
         return 1 & (inv ^ (cf | zf));

      case X86CondNS:
      case X86CondS: /* SF == 1 */
         sf = eflags >> X86G_CC_SHIFT_S;
         return 1 & (inv ^ sf);

      case X86CondNP:
      case X86CondP: /* PF == 1 */
         pf = eflags >> X86G_CC_SHIFT_P;
         return 1 & (inv ^ pf);

      case X86CondNL:
      case X86CondL: /* (SF xor OF) == 1 */
         sf = eflags >> X86G_CC_SHIFT_S;
         of = eflags >> X86G_CC_SHIFT_O;
         return 1 & (inv ^ (sf ^ of));

      case X86CondNLE:
      case X86CondLE: /* ((SF xor OF) or ZF)  == 1 */
         sf = eflags >> X86G_CC_SHIFT_S;
         of = eflags >> X86G_CC_SHIFT_O;
         zf = eflags >> X86G_CC_SHIFT_Z;
         return 1 & (inv ^ ((sf ^ of) | zf));

      default:
         /* shouldn't really make these calls from generated code */
         vex_printf("x86g_calculate_condition( %u, %u, 0x%x, 0x%x, 0x%x )\n",
                    cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
         vpanic("x86g_calculate_condition");
   }
}


/* VISIBLE TO LIBVEX CLIENT */
/* Assemble a flag word from the guest state: the six arithmetic
   flags are evaluated from the CC thunk, then the D, ID and AC bits
   are added from their separate state fields.  guest_DFLAG must be
   either 1 or 0xFFFFFFFF (asserted); the latter means D is set.  ID
   and AC are set when their fields equal 1. */
UInt LibVEX_GuestX86_get_eflags ( /*IN*/const VexGuestX86State* vex_state )
{
   UInt result;
   UInt dflag;

   result = x86g_calculate_eflags_all_WRK(
               vex_state->guest_CC_OP,
               vex_state->guest_CC_DEP1,
               vex_state->guest_CC_DEP2,
               vex_state->guest_CC_NDEP
            );

   dflag = vex_state->guest_DFLAG;
   vassert(dflag == 1 || dflag == 0xFFFFFFFF);

   result |= (dflag == 0xFFFFFFFF)          ? X86G_CC_MASK_D  : 0;
   result |= (vex_state->guest_IDFLAG == 1) ? X86G_CC_MASK_ID : 0;
   result |= (vex_state->guest_ACFLAG == 1) ? X86G_CC_MASK_AC : 0;

   return result;
}

/* VISIBLE TO LIBVEX CLIENT */
/* Store a flag word into the guest state.  The D, ID and AC bits
   are peeled off into their own state fields (D is stored as
   0xFFFFFFFF when set and 1 when clear; ID and AC as 1 or 0).  The
   remaining O/S/Z/A/C/P bits are installed as a COPY thunk: CC_OP
   becomes X86G_CC_OP_COPY, CC_DEP1 holds the bits, and CC_DEP2 and
   CC_NDEP are zeroed. */
void
LibVEX_GuestX86_put_eflags ( UInt eflags,
                             /*MOD*/VexGuestX86State* vex_state )
{
   const UInt oszacp_mask
      = X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z |
        X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P;

   /* D flag */
   vex_state->guest_DFLAG = (eflags & X86G_CC_MASK_D) ? 0xFFFFFFFF : 1;
   eflags &= ~X86G_CC_MASK_D;

   /* ID flag */
   vex_state->guest_IDFLAG = (eflags & X86G_CC_MASK_ID) ? 1 : 0;
   eflags &= ~X86G_CC_MASK_ID;

   /* AC flag */
   vex_state->guest_ACFLAG = (eflags & X86G_CC_MASK_AC) ? 1 : 0;
   eflags &= ~X86G_CC_MASK_AC;

   /* Whatever is left of the arithmetic flags goes into the thunk. */
   vex_state->guest_CC_OP   = X86G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = eflags & oszacp_mask;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}

/* VISIBLE TO LIBVEX CLIENT */
/* Force the guest's carry flag to the low bit of new_carry_flag
   while keeping the other five arithmetic flags.  The current flags
   are evaluated from the thunk, the C bit is replaced, and the
   result is written back as a COPY thunk (CC_DEP2 and CC_NDEP
   zeroed). */
void
LibVEX_GuestX86_put_eflag_c ( UInt new_carry_flag,
                              /*MOD*/VexGuestX86State* vex_state )
{
   UInt flags = x86g_calculate_eflags_all_WRK(
                   vex_state->guest_CC_OP,
                   vex_state->guest_CC_DEP1,
                   vex_state->guest_CC_DEP2,
                   vex_state->guest_CC_NDEP
                );

   /* Drop the old carry, then put the requested one back in. */
   flags &= ~X86G_CC_MASK_C;
   if (new_carry_flag & 1)
      flags |= X86G_CC_MASK_C;

   vex_state->guest_CC_OP   = X86G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = flags;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
}


/*---------------------------------------------------------------*/
/*--- %eflags translation-time function specialisers.         ---*/
/*--- These help iropt specialise calls the above run-time    ---*/
/*--- %eflags functions.                                      ---*/
/*---------------------------------------------------------------*/

/* Used by the optimiser to try specialisations.  Returns an
   equivalent expression, or NULL if none. */

/* True iff e is a constant expression carrying a U32 whose value is
   exactly n. */
static inline Bool isU32 ( IRExpr* e, UInt n )
{
   /* Anything that is not a constant cannot match. */
   if (e->tag != Iex_Const)
      return toBool(0);

   /* A constant of some other type cannot match either. */
   if (e->Iex.Const.con->tag != Ico_U32)
      return toBool(0);

   return toBool(e->Iex.Const.con->Ico.U32 == n);
}

/* Try to replace a call to one of the run-time %eflags helpers with
   an equivalent in-line IR expression.

      function_name   name of the helper being called
      args            NULL-terminated vector of the call's argument
                      expressions
      precedingStmts, n_precedingStmts
                      not examined by this function

   Three helpers are handled: "x86g_calculate_condition",
   "x86g_calculate_eflags_c" and "x86g_calculate_eflags_all".  A
   specialisation is only produced when cc_op (and, for the condition
   helper, cond) are U32 constants matching one of the cases below.
   The cases are tried in order, so a more specific case (for example
   one that also requires cc_dep2 to be the constant zero) must appear
   before the general case for the same cc_op/cond pair.

   Returns the replacement expression, or NULL if nothing applies. */
IRExpr* guest_x86_spechelper ( const HChar* function_name,
                               IRExpr** args,
                               IRStmt** precedingStmts,
                               Int      n_precedingStmts )
{
#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
#  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))

   /* Count the arguments; args is terminated by a NULL entry. */
   Int i, arity = 0;
   for (i = 0; args[i]; i++)
      arity++;
#  if 0
   vex_printf("spec request:\n");
   vex_printf("   %s  ", function_name);
   for (i = 0; i < arity; i++) {
      vex_printf("  ");
      ppIRExpr(args[i]);
   }
   vex_printf("\n");
#  endif

   /* --------- specialising "x86g_calculate_condition" --------- */

   if (vex_streq(function_name, "x86g_calculate_condition")) {
      /* specialise calls to above "calculate condition" function */
      /* Every result built here is a 32-bit value that is 0 or 1. */
      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
      vassert(arity == 5);
      /* args[4] is not needed by any of the cases below. */
      cond    = args[0];
      cc_op   = args[1];
      cc_dep1 = args[2];
      cc_dep2 = args[3];

      /*---------------- ADDL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_ADDL) && isU32(cond, X86CondZ)) {
         /* long add, then Z --> test (dst+src == 0) */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, 
                           binop(Iop_Add32, cc_dep1, cc_dep2),
                           mkU32(0)));
      }

      /*---------------- SUBL ----------------*/

      /* 4, 5 */
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondZ)) {
         /* long sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNZ)) {
         /* long sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE32, cc_dep1, cc_dep2));
      }

      /* 12, 13 */
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondL)) {
         /* long sub/cmp, then L (signed less than) 
            --> test dst <s src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32S, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNL)) {
         /* long sub/cmp, then NL (signed greater than or equal) 
            --> test !(dst <s src) */
         /* The negation is done by XORing the 0/1 result with 1. */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLT32S, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      /* 14, 15 */
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondLE)) {
         /* long sub/cmp, then LE (signed less than or equal)
            --> test dst <=s src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLE32S, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNLE)) {
         /* long sub/cmp, then NLE (signed not less than or equal)
            --> test dst >s src 
            --> test !(dst <=s src) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLE32S, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      /* 6, 7 */
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondBE)) {
         /* long sub/cmp, then BE (unsigned less than or equal)
            --> test dst <=u src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNBE)) {
         /* long sub/cmp, then NBE (unsigned greater than)
            --> test !(dst <=u src) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLE32U, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      /* 2, 3 */
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondB)) {
         /* long sub/cmp, then B (unsigned less than)
            --> test dst <u src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNB)) {
         /* long sub/cmp, then NB (unsigned greater than or equal)
            --> test !(dst <u src) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLT32U, cc_dep1, cc_dep2)),
                      mkU32(1));
      }

      /* 8, 9 */
      /* These two compare-against-zero cases must stay ahead of the
         general SUBL/S and SUBL/NS cases further down, which would
         otherwise match first. */
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondS)
                                        && isU32(cc_dep2, 0)) {
         /* long sub/cmp of zero, then S --> test (dst-0 <s 0)
                                         --> test dst <s 0
                                         --> (UInt)dst[31] */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(31)),
                      mkU32(1));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNS)
                                        && isU32(cc_dep2, 0)) {
         /* long sub/cmp of zero, then NS --> test !(dst-0 <s 0)
                                          --> test !(dst <s 0)
                                          --> (UInt) !dst[31] */
         return binop(Iop_Xor32,
                      binop(Iop_And32,
                            binop(Iop_Shr32,cc_dep1,mkU8(31)),
                            mkU32(1)),
                mkU32(1));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondS)) {
         /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32S, 
                           binop(Iop_Sub32, cc_dep1, cc_dep2),
                           mkU32(0)));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNS)) {
         /* long sub/cmp, then NS (not negative) --> test !(dst-src <s 0) */
         return binop(Iop_Xor32,
                      unop(Iop_1Uto32,
                           binop(Iop_CmpLT32S, 
                                 binop(Iop_Sub32, cc_dep1, cc_dep2),
                                 mkU32(0))),
                      mkU32(1));
      }

      /*---------------- SUBW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondZ)) {
         /* word sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ16, 
                           unop(Iop_32to16,cc_dep1), 
                           unop(Iop_32to16,cc_dep2)));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondNZ)) {
         /* word sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE16, 
                           unop(Iop_32to16,cc_dep1), 
                           unop(Iop_32to16,cc_dep2)));
      }

      /*---------------- SUBB ----------------*/

      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondZ)) {
         /* byte sub/cmp, then Z --> test dst==src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ8, 
                           unop(Iop_32to8,cc_dep1), 
                           unop(Iop_32to8,cc_dep2)));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNZ)) {
         /* byte sub/cmp, then NZ --> test dst!=src */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE8, 
                           unop(Iop_32to8,cc_dep1), 
                           unop(Iop_32to8,cc_dep2)));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNBE)) {
         /* byte sub/cmp, then NBE (unsigned greater than)
            --> test src <u dst */
         /* Note, args are opposite way round from the usual */
         /* Both operands are masked to their low 8 bits so that a
            32-bit unsigned compare can be used. */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U, 
                           binop(Iop_And32,cc_dep2,mkU32(0xFF)),
			   binop(Iop_And32,cc_dep1,mkU32(0xFF))));
      }

      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondS)
                                        && isU32(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then S --> test (dst-0 <s 0) 
                                         --> test dst <s 0
                                         --> (UInt)dst[7] 
            This is yet another scheme by which gcc figures out if the
            top bit of a byte is 1 or 0.  See also LOGICB/CondS below. */
         /* Note: isU32(cc_dep2, 0) is correct, even though this is
            for an 8-bit comparison, since the args to the helper
            function are always U32s. */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(7)),
                      mkU32(1));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNS)
                                        && isU32(cc_dep2, 0)) {
         /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0) 
                                          --> test !(dst <s 0)
                                          --> (UInt) !dst[7] 
         */
         return binop(Iop_Xor32,
                      binop(Iop_And32,
                            binop(Iop_Shr32,cc_dep1,mkU8(7)),
                            mkU32(1)),
                mkU32(1));
      }

      /*---------------- LOGICL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondZ)) {
         /* long and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondNZ)) {
         /* long and/or/xor, then NZ --> test dst!=0 */
         return unop(Iop_1Uto32,binop(Iop_CmpNE32, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondLE)) {
         /* long and/or/xor, then LE
            This is pretty subtle.  LOGIC sets SF and ZF according to the
            result and makes OF be zero.  LE computes (SF ^ OF) | ZF, but
            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
            the result is <=signed 0.  Hence ...
         */
         return unop(Iop_1Uto32,binop(Iop_CmpLE32S, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondBE)) {
         /* long and/or/xor, then BE
            LOGIC sets ZF according to the result and makes CF be zero.
            BE computes (CF | ZF), but CF is zero, so this reduces to ZF 
            -- which will be 1 iff the result is zero.  Hence ...
         */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondS)) {
         /* see comment below for (LOGICB, CondS) */
         /* long and/or/xor, then S --> (UInt)result[31] */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(31)),
                      mkU32(1));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondNS)) {
         /* see comment below for (LOGICB, CondNS) */
         /* long and/or/xor, then NS --> (UInt) ~ result[31] */
         return binop(Iop_Xor32,
                binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(31)),
                      mkU32(1)),
                mkU32(1));
      }

      /*---------------- LOGICW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondZ)) {
         /* word and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(0xFFFF)), 
                                        mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondS)) {
         /* see comment below for (LOGICB, CondS) */
         /* word and/or/xor, then S --> (UInt)result[15] */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(15)),
                      mkU32(1));
      }

      /*---------------- LOGICB ----------------*/

      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondZ)) {
         /* byte and/or/xor, then Z --> test dst==0 */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(255)), 
                                        mkU32(0)));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondNZ)) {
         /* byte and/or/xor, then NZ --> test dst!=0 */
         /* b9ac9:       84 c0                   test   %al,%al
            b9acb:       75 0d                   jne    b9ada */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpNE32, binop(Iop_And32,cc_dep1,mkU32(255)), 
                                        mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondS)) {
         /* this is an idiom gcc sometimes uses to find out if the top
            bit of a byte register is set: eg testb %al,%al; js ..
            Since it just depends on the top bit of the byte, extract
            that bit and explicitly get rid of all the rest.  This
            helps memcheck avoid false positives in the case where any
            of the other bits in the byte are undefined. */
         /* byte and/or/xor, then S --> (UInt)result[7] */
         return binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(7)),
                      mkU32(1));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondNS)) {
         /* ditto, for negation-of-S. */
         /* byte and/or/xor, then NS --> (UInt) ~ result[7] */
         return binop(Iop_Xor32,
                binop(Iop_And32,
                      binop(Iop_Shr32,cc_dep1,mkU8(7)),
                      mkU32(1)),
                mkU32(1));
      }

      /*---------------- DECL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondZ)) {
         /* dec L, then Z --> test dst == 0 */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }

      if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondS)) {
         /* dec L, then S --> compare DST <s 0 */
         return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
      }

      /*---------------- DECW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_DECW) && isU32(cond, X86CondZ)) {
         /* dec W, then Z --> test dst == 0 */
         /* Shifting left by 16 throws away the upper half of dep1, so
            only its low 16 bits take part in the comparison. */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, 
                           binop(Iop_Shl32,cc_dep1,mkU8(16)), 
                           mkU32(0)));
      }

      /*---------------- INCW ----------------*/

      if (isU32(cc_op, X86G_CC_OP_INCW) && isU32(cond, X86CondZ)) {
         /* This rewrite helps memcheck on 'incw %ax ; je ...'. */
         /* inc W, then Z --> test dst == 0 */
         /* As for DECW: the shift leaves only the low 16 bits of dep1
            to be compared against zero. */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpEQ32, 
                           binop(Iop_Shl32,cc_dep1,mkU8(16)),
                           mkU32(0)));
      }

      /*---------------- SHRL ----------------*/

      if (isU32(cc_op, X86G_CC_OP_SHRL) && isU32(cond, X86CondZ)) {
         /* SHRL, then Z --> test dep1 == 0 */
         return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
      }

      /*---------------- COPY ----------------*/
      /* This can happen, as a result of x87 FP compares: "fcom ... ;
         fnstsw %ax ; sahf ; jbe" for example. */

      if (isU32(cc_op, X86G_CC_OP_COPY) && 
          (isU32(cond, X86CondBE) || isU32(cond, X86CondNBE))) {
         /* COPY, then BE --> extract C and Z from dep1, and test 
            (C or Z) == 1. */
         /* COPY, then NBE --> extract C and Z from dep1, and test
            (C or Z) == 0. */
         /* nnn is the value that (C or Z) must equal for the condition
            to hold. */
         UInt nnn = isU32(cond, X86CondBE) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(
                        Iop_Or32,
                        binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
                        binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_Z))
                     ),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }
      
      if (isU32(cc_op, X86G_CC_OP_COPY) 
          && (isU32(cond, X86CondB) || isU32(cond, X86CondNB))) {
         /* COPY, then B --> extract C from dep1, and test (C == 1). */
         /* COPY, then NB --> extract C from dep1, and test (C == 0). */
         UInt nnn = isU32(cond, X86CondB) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }

      if (isU32(cc_op, X86G_CC_OP_COPY) 
          && (isU32(cond, X86CondZ) || isU32(cond, X86CondNZ))) {
         /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
         /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
         UInt nnn = isU32(cond, X86CondZ) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_Z)),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }

      if (isU32(cc_op, X86G_CC_OP_COPY) 
          && (isU32(cond, X86CondP) || isU32(cond, X86CondNP))) {
         /* COPY, then P --> extract P from dep1, and test (P == 1). */
         /* COPY, then NP --> extract P from dep1, and test (P == 0). */
         UInt nnn = isU32(cond, X86CondP) ? 1 : 0;
         return
            unop(
               Iop_1Uto32,
               binop(
                  Iop_CmpEQ32,
                  binop(
                     Iop_And32,
                     binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_P)),
                     mkU32(1)
                  ),
                  mkU32(nnn)
               )
            );
      }

      /* No specialisation known for this cc_op/cond combination. */
      return NULL;
   }

   /* --------- specialising "x86g_calculate_eflags_c" --------- */

   if (vex_streq(function_name, "x86g_calculate_eflags_c")) {
      /* specialise calls to above "calculate_eflags_c" function */
      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      cc_dep2 = args[2];
      cc_ndep = args[3];

      if (isU32(cc_op, X86G_CC_OP_SUBL)) {
         /* C after sub denotes unsigned less than */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
      }
      if (isU32(cc_op, X86G_CC_OP_SUBB)) {
         /* C after sub denotes unsigned less than */
         /* Only the low 8 bits of each operand are compared. */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U, 
                           binop(Iop_And32,cc_dep1,mkU32(0xFF)),
                           binop(Iop_And32,cc_dep2,mkU32(0xFF))));
      }
      if (isU32(cc_op, X86G_CC_OP_LOGICL)
          || isU32(cc_op, X86G_CC_OP_LOGICW)
          || isU32(cc_op, X86G_CC_OP_LOGICB)) {
         /* cflag after logic is zero */
         return mkU32(0);
      }
      if (isU32(cc_op, X86G_CC_OP_DECL) || isU32(cc_op, X86G_CC_OP_INCL)) {
         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
         return cc_ndep;
      }
      if (isU32(cc_op, X86G_CC_OP_COPY)) {
         /* cflag after COPY is stored in DEP1. */
         return
            binop(
               Iop_And32,
               binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
               mkU32(1)
            );
      }
      if (isU32(cc_op, X86G_CC_OP_ADDL)) {
         /* C after add denotes sum <u either arg */
         return unop(Iop_1Uto32,
                     binop(Iop_CmpLT32U, 
                           binop(Iop_Add32, cc_dep1, cc_dep2), 
                           cc_dep1));
      }
      // ATC, requires verification, no test case known
      //if (isU32(cc_op, X86G_CC_OP_SMULL)) {
      //   /* C after signed widening multiply denotes the case where
      //      the top half of the result isn't simply the sign extension
      //      of the bottom half (iow the result doesn't fit completely
      //      in the bottom half).  Hence: 
      //        C = hi-half(dep1 x dep2) != lo-half(dep1 x dep2) >>s 31 
      //      where 'x' denotes signed widening multiply.*/
      //   return 
      //      unop(Iop_1Uto32,
      //           binop(Iop_CmpNE32, 
      //                 unop(Iop_64HIto32,
      //                      binop(Iop_MullS32, cc_dep1, cc_dep2)),
      //                 binop(Iop_Sar32,
      //                       binop(Iop_Mul32, cc_dep1, cc_dep2), mkU8(31)) ));
      //}
#     if 0
      if (cc_op->tag == Iex_Const) {
         vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
      }
#     endif

      /* No specialisation known for this cc_op. */
      return NULL;
   }

   /* --------- specialising "x86g_calculate_eflags_all" --------- */

   if (vex_streq(function_name, "x86g_calculate_eflags_all")) {
      /* specialise calls to above "calculate_eflags_all" function */
      IRExpr *cc_op, *cc_dep1; /*, *cc_dep2, *cc_ndep; */
      vassert(arity == 4);
      cc_op   = args[0];
      cc_dep1 = args[1];
      /* cc_dep2 = args[2]; */
      /* cc_ndep = args[3]; */

      if (isU32(cc_op, X86G_CC_OP_COPY)) {
         /* eflags after COPY are stored in DEP1. */
         /* Mask so that only the O, S, Z, A, C and P bits survive. */
         return
            binop(
               Iop_And32,
               cc_dep1,
               mkU32(X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z 
                     | X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P)
            );
      }
      return NULL;
   }

#  undef unop
#  undef binop
#  undef mkU32
#  undef mkU8

   /* Not one of the helpers this function knows how to specialise. */
   return NULL;
}


/*---------------------------------------------------------------*/
/*--- Supporting functions for x87 FPU activities.            ---*/
/*---------------------------------------------------------------*/

/* Run-time probe of the host's byte order: True iff the byte at the
   lowest address of a 32-bit word is that word's least significant
   byte. */
static inline Bool host_is_little_endian ( void )
{
   UInt   probe = 0x76543210;
   UChar* bytes = (UChar*)(&probe);
   return toBool(bytes[0] == 0x10);
}

/* 80 and 64-bit floating point formats:

   80-bit:

    S  0       0-------0      zero
    S  0       0X------X      denormals
    S  1-7FFE  1X------X      normals (all normals have leading 1)
    S  7FFF    10------0      infinity
    S  7FFF    10X-----X      snan
    S  7FFF    11X-----X      qnan

   S is the sign bit.  For runs X----X, at least one of the Xs must be
   nonzero.  Exponent is 15 bits, fractional part is 63 bits, and
   there is an explicitly represented leading 1, and a sign bit,
   giving 80 in total.

   64-bit avoids the confusion of an explicitly represented leading 1
   and so is simpler:

    S  0      0------0   zero
    S  0      X------X   denormals
    S  1-7FE  any        normals
    S  7FF    0------0   infinity
    S  7FF    0X-----X   snan
    S  7FF    1X-----X   qnan

   Exponent is 11 bits, fractional part is 52 bits, and there is a 
   sign bit, giving 64 in total.
*/

/* Classify a register value in the manner of the x87 'FXAM'
   instruction.  'tag' is the register's tag (zero means empty) and
   'dbl' is the register contents as a 64-bit IEEE754 bit pattern.
   The result carries the class in the C3/C2/C0 positions and the sign
   bit of 'dbl' in the C1 position:

      C3 C2 C0
       1  0  1    empty
       1  0  0    zero
       1  1  0    denormal
       0  1  1    infinity
       0  0  1    NaN
       0  1  0    normal finite number
*/
/* CALLED FROM GENERATED CODE: CLEAN HELPER */
UInt x86g_calculate_FXAM ( UInt tag, ULong dbl ) 
{
   UChar* bytes;
   UChar  sign;
   Int    bexp;
   Bool   mantissaIsZero;
   UInt   cls;

   /* The byte indexing below is only right on a little-endian host. */
   vassert(host_is_little_endian());

   /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */

   bytes = (UChar*)(&dbl);
   sign  = toUChar( (bytes[7] >> 7) & 1 );

   if (tag == 0) {
      /* Empty register: the value itself is not examined further. */
      cls = X86G_FC_MASK_C3 | X86G_FC_MASK_C0;
   } else {
      /* 11-bit biased exponent: low 7 bits of byte 7 followed by the
         high 4 bits of byte 6. */
      bexp = ((bytes[7] & 0x7F) << 4) | (bytes[6] >> 4);

      /* 52-bit fraction: low 4 bits of byte 6 plus bytes 5 .. 0. */
      mantissaIsZero
         = toBool(
              (bytes[6] & 0x0F) == 0
              && (bytes[5] | bytes[4] | bytes[3]
                  | bytes[2] | bytes[1] | bytes[0]) == 0
           );

      if (bexp == 0) {
         /* zero or denormal */
         cls = mantissaIsZero
                  ? X86G_FC_MASK_C3
                  : (X86G_FC_MASK_C3 | X86G_FC_MASK_C2);
      } else if (bexp == 0x7FF) {
         /* infinity or NaN */
         cls = mantissaIsZero
                  ? (X86G_FC_MASK_C2 | X86G_FC_MASK_C0)
                  : X86G_FC_MASK_C0;
      } else {
         /* anything else is a normal finite number */
         cls = X86G_FC_MASK_C2;
      }
   }

   /* Every class reports the sign in C1. */
   return cls | (sign << X86G_FC_SHIFT_C1);
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest memory) */
/* Run convert_f80le_to_f64le on the bytes at address addrU and return
   the 64-bit result. */
ULong x86g_dirtyhelper_loadF80le ( Addr addrU )
{
   ULong  result;
   UChar* src = (UChar*)addrU;
   UChar* dst = (UChar*)&result;
   convert_f80le_to_f64le ( src, dst );
   return result;
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest memory) */
/* Run convert_f64le_to_f80le on the 64-bit value f64, writing the
   result to address addrU. */
void x86g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
{
   UChar* src = (UChar*)&f64;
   UChar* dst = (UChar*)addrU;
   convert_f64le_to_f80le( src, dst );
}


/*----------------------------------------------*/
/*--- The exported fns ..                    ---*/
/*----------------------------------------------*/

/* Layout of the real x87 state. */
/* 13 June 05: Fpu_State and auxiliary constants were moved to
   g_generic_x87.h */


/* CLEAN HELPER */
/* Examine an x87 native format FPU control word, held in
   fpucw[15:0].  The rounding mode is taken from fpucw[11:10], which
   is encoded exactly as per enum IRRoundingMode.  An emulation
   warning is raised if any of fpucw[5:0] is clear (unmasked
   exceptions) or, failing that, if fpucw[9:8] is not 3 (unsupported
   precision).  Returns (warn << 32) | fpround value.
*/
ULong x86g_check_fldcw ( UInt fpucw )
{
   VexEmNote note;
   UInt      rounding = (fpucw >> 10) & 3;

   if ((fpucw & 0x3F) != 0x3F) {
      /* at least one exception is unmasked */
      note = EmWarn_X86_x87exns;
   } else if (((fpucw >> 8) & 3) != 3) {
      /* precision control selects something we don't support */
      note = EmWarn_X86_x87precision;
   } else {
      note = EmNote_NONE;
   }

   return (((ULong)note) << 32) | ((ULong)rounding);
}

/* CLEAN HELPER */
/* Build an x87 native format FPU control word from fpround, an
   IRRoundingMode value.  Only the low two bits of fpround are used;
   they land in bits 11:10 on top of the fixed pattern 0x037F. */
UInt x86g_create_fpucw ( UInt fpround )
{
   UInt rc_field = (fpround & 3) << 10;
   return rc_field | 0x037F;
}


/* CLEAN HELPER */
/* Examine an SSE native format MXCSR value, held in mxcsr[15:0].
   The rounding mode is taken from mxcsr[14:13], which is encoded
   exactly as per enum IRRoundingMode.  At most one emulation warning
   is raised, chosen in this order of priority: any of mxcsr[12:7]
   clear (unmasked exceptions), bit 15 set (FZ), bit 6 set (DAZ).
   Returns (warn << 32) | sseround value.
*/
ULong x86g_check_ldmxcsr ( UInt mxcsr )
{
   VexEmNote note;
   UInt      rounding = (mxcsr >> 13) & 3;

   if ((mxcsr & 0x1F80) != 0x1F80) {
      /* at least one exception is unmasked */
      note = EmWarn_X86_sseExns;
   } else if (mxcsr & (1<<15)) {
      /* FZ is set */
      note = EmWarn_X86_fz;
   } else if (mxcsr & (1<<6)) {
      /* DAZ is set */
      note = EmWarn_X86_daz;
   } else {
      note = EmNote_NONE;
   }

   return (((ULong)note) << 32) | ((ULong)rounding);
}


/* CLEAN HELPER */
/* Build an SSE native format MXCSR value from sseround, an
   IRRoundingMode value.  Only the low two bits of sseround are used;
   they land in bits 14:13 on top of the fixed pattern 0x1F80. */
UInt x86g_create_mxcsr ( UInt sseround )
{
   UInt rc_field = (sseround & 3) << 13;
   return rc_field | 0x1F80;
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest state) */
/* Reset the guest's x87 FPU state as 'finit' would: stack top at
   zero, all eight registers tagged empty and holding zero, rounding
   to nearest, and the C3..C0 bits cleared. */
void x86g_dirtyhelper_FINIT ( VexGuestX86State* gst )
{
   Int r = 0;

   gst->guest_FC3210  = 0;
   gst->guest_FPROUND = (UInt)Irrm_NEAREST;
   gst->guest_FTOP    = 0;

   while (r < 8) {
      gst->guest_FPREG[r] = 0; /* IEEE754 64-bit zero */
      gst->guest_FPTAG[r] = 0; /* empty */
      r++;
   }
}


/* Transfer an x87 state image into the guest state.  This serves
   both 'frstor' and 'fldenv'; the two appear to differ only in that
   for 'fldenv' (moveRegs false) the 8 FP register values are not
   transferred, only the tags, stack top, C3..C0 bits and rounding
   mode.  Returns any emulation warning produced while checking the
   image's control word. */
static
VexEmNote do_put_x87 ( Bool moveRegs,
                       /*IN*/Fpu_State* x87_state,
                       /*OUT*/VexGuestX86State* vex_state )
{
   ULong*     regsOut = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*     tagsOut = (UChar*)(&vex_state->guest_FPTAG[0]);
   UInt       statw   = x87_state->env[FP_ENV_STAT];
   UInt       tagw    = x87_state->env[FP_ENV_TAG];
   UInt       ctrlw   = x87_state->env[FP_ENV_CTRL];
   UInt       top     = (statw >> 11) & 7;
   ULong      pair;
   Int        st, phys;
   UInt       isEmpty;

   /* Walk the stack in ST order; 'phys' is the physical register
      that ST(st) currently maps to. */
   for (st = 0; st < 8; st++) {
      phys    = (st + top) & 7;
      isEmpty = ((tagw >> (2*phys)) & 3) == 3;

      if (moveRegs) {
         if (isEmpty) {
            /* An empty register is still written, so that memcheck's
               view -- that this helper defines all FP registers --
               stays in sync with reality. */
            regsOut[phys] = 0; /* IEEE754 64-bit zero */
         } else {
            convert_f80le_to_f64le( &x87_state->reg[10*st],
                                    (UChar*)&regsOut[phys] );
         }
      }
      tagsOut[phys] = isEmpty ? 0 : 1;
   }

   /* stack pointer, and the C3..C0 part of the status word */
   vex_state->guest_FTOP   = top;
   vex_state->guest_FC3210 = statw & 0x4700;

   /* The control word yields both the rounding mode (low half of
      'pair') and any emulation warning (high half). */
   pair = x86g_check_fldcw ( (UInt)ctrlw );
   vex_state->guest_FPROUND = ((UInt)pair) & 3;

   /* emulation warnings --> caller */
   return (VexEmNote)(pair >> 32);
}


/* Build an x87 FPU state image from the guest state, as closely as
   the guest state allows.  The 14 environment words are filled in
   first (words 1, 3, 5 and 13 as 0xFFFF, the rest zero, then the
   status and control words), followed by the eight registers in ST
   order and finally the tag word.  Registers tagged empty are
   converted and written just like full ones; only their tag-word
   entry differs. */
static
void do_get_x87 ( /*IN*/VexGuestX86State* vex_state,
                  /*OUT*/Fpu_State* x87_state )
{
   ULong*     regsIn = (ULong*)(&vex_state->guest_FPREG[0]);
   UChar*     tagsIn = (UChar*)(&vex_state->guest_FPTAG[0]);
   UInt       top    = vex_state->guest_FTOP;
   UInt       c3210  = vex_state->guest_FC3210;
   UInt       tagw   = 0;
   Int        k, st, phys;

   for (k = 0; k < 14; k++) {
      if (k == 1 || k == 3 || k == 5 || k == 13)
         x87_state->env[k] = 0xFFFF;
      else
         x87_state->env[k] = 0;
   }

   x87_state->env[FP_ENV_STAT] 
      = toUShort(((top & 7) << 11) | (c3210 & 0x4700));
   x87_state->env[FP_ENV_CTRL] 
      = toUShort(x86g_create_fpucw( vex_state->guest_FPROUND ));

   /* Dump the register stack in ST order. */
   for (st = 0; st < 8; st++) {
      phys = (st + top) & 7;

      /* Tag-word entry: 3 for an empty register, 0 for a full one. */
      if (tagsIn[phys] == 0)
         tagw |= (3 << (2*phys));

      convert_f64le_to_f80le( (UChar*)&regsIn[phys], 
                              &x87_state->reg[10*st] );
   }
   x87_state->env[FP_ENV_TAG] = toUShort(tagw);
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
/* Write an fxsave-style image of the guest x87 and SSE state to the
   memory at 'addr'.  The first 160 bytes hold the x87 environment,
   MXCSR and the eight x87 registers; %xmm0 .. %xmm7 follow from
   offset 160.  Several environment fields (FOP, FPU IP/DP and their
   selectors) are not tracked and are written as zero. */
void x86g_dirtyhelper_FXSAVE ( VexGuestX86State* gst, HWord addr )
{
   /* Somewhat roundabout, but at least it's simple. */
   Fpu_State tmp;
   UShort*   addrS = (UShort*)addr;
   UChar*    addrC = (UChar*)addr;
   U128*     xmm   = (U128*)(addr + 160);
   UInt      mxcsr;
   UShort    fp_tags;
   UInt      summary_tags;
   Int       r, stno;
   UShort    *srcS, *dstS;

   /* First build an fsave-style x87 image in 'tmp', and synthesise
      MXCSR from the guest SSE rounding mode. */
   do_get_x87( gst, &tmp );
   mxcsr = x86g_create_mxcsr( gst->guest_SSEROUND );

   /* Now build the proper fxsave image from the x87 image we just
      made. */

   addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */

   /* set addrS[2] in an endian-independent way */
   /* The fxsave tag byte has one bit per register: set when the
      register's 2-bit tag in the x87 tag word is anything other
      than 3 (empty). */
   summary_tags = 0;
   fp_tags = tmp.env[FP_ENV_TAG];
   for (r = 0; r < 8; r++) {
      if ( ((fp_tags >> (2*r)) & 3) != 3 )
         summary_tags |= (1 << r);
   }
   addrC[4]  = toUChar(summary_tags); /* FTW: tag summary byte */
   addrC[5]  = 0; /* pad */

   addrS[3]  = 0; /* FOP: fpu opcode (bogus) */
   addrS[4]  = 0;
   addrS[5]  = 0; /* FPU IP (bogus) */
   addrS[6]  = 0; /* FPU IP's segment selector (bogus) (although we
                     could conceivably dump %CS here) */

   addrS[7]  = 0; /* Intel reserved */

   addrS[8]  = 0; /* FPU DP (operand pointer) (bogus) */
   addrS[9]  = 0; /* FPU DP (operand pointer) (bogus) */
   addrS[10] = 0; /* segment selector for above operand pointer; %DS
                     perhaps? */
   addrS[11] = 0; /* Intel reserved */

   addrS[12] = toUShort(mxcsr);  /* MXCSR (lo16) */
   addrS[13] = toUShort(mxcsr >> 16); /* MXCSR (hi16) */

   addrS[14] = 0xFFFF; /* MXCSR mask (lo16); who knows what for */
   addrS[15] = 0xFFFF; /* MXCSR mask (hi16); who knows what for */

   /* Copy in the FP registers, in ST order.  Each register occupies
      a 16-byte slot: 10 bytes of value (5 UShorts) followed by 6
      bytes of zero padding. */
   for (stno = 0; stno < 8; stno++) {
      srcS = (UShort*)(&tmp.reg[10*stno]);
      dstS = (UShort*)(&addrS[16 + 8*stno]);
      dstS[0] = srcS[0];
      dstS[1] = srcS[1];
      dstS[2] = srcS[2];
      dstS[3] = srcS[3];
      dstS[4] = srcS[4];
      dstS[5] = 0;
      dstS[6] = 0;
      dstS[7] = 0;
   }

   /* That's the first 160 bytes of the image done.  Now only %xmm0
      .. %xmm7 remain to be copied.  If the host is big-endian, these
      need to be byte-swapped. */
   vassert(host_is_little_endian());

   /* Copies one 128-bit value as four indexed elements. */
#  define COPY_U128(_dst,_src)                       \
      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
      while (0)

   COPY_U128( xmm[0], gst->guest_XMM0 );
   COPY_U128( xmm[1], gst->guest_XMM1 );
   COPY_U128( xmm[2], gst->guest_XMM2 );
   COPY_U128( xmm[3], gst->guest_XMM3 );
   COPY_U128( xmm[4], gst->guest_XMM4 );
   COPY_U128( xmm[5], gst->guest_XMM5 );
   COPY_U128( xmm[6], gst->guest_XMM6 );
   COPY_U128( xmm[7], gst->guest_XMM7 );

#  undef COPY_U128
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest state, reads guest mem) */
/* Load the guest x87 and SSE state from the fxsave-style image at
   'addr'.  Returns an emulation note: the one produced while
   restoring the x87 state if there is one, otherwise the one
   produced while checking the MXCSR value (possibly EmNote_NONE). */
VexEmNote x86g_dirtyhelper_FXRSTOR ( VexGuestX86State* gst, HWord addr )
{
   Fpu_State tmp;
   VexEmNote warnX87 = EmNote_NONE;
   VexEmNote warnXMM = EmNote_NONE;
   UShort*   addrS   = (UShort*)addr;
   UChar*    addrC   = (UChar*)addr;
   U128*     xmm     = (U128*)(addr + 160);
   UShort    fp_tags;
   Int       r, stno, i;

   /* Restore %xmm0 .. %xmm7.  If the host is big-endian, these need
      to be byte-swapped. */
   vassert(host_is_little_endian());

   /* Copies one 128-bit value as four indexed elements. */
#  define COPY_U128(_dst,_src)                       \
      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
      while (0)

   COPY_U128( gst->guest_XMM0, xmm[0] );
   COPY_U128( gst->guest_XMM1, xmm[1] );
   COPY_U128( gst->guest_XMM2, xmm[2] );
   COPY_U128( gst->guest_XMM3, xmm[3] );
   COPY_U128( gst->guest_XMM4, xmm[4] );
   COPY_U128( gst->guest_XMM5, xmm[5] );
   COPY_U128( gst->guest_XMM6, xmm[6] );
   COPY_U128( gst->guest_XMM7, xmm[7] );

#  undef COPY_U128

   /* Copy the x87 registers out of the image, into a temporary
      Fpu_State struct. */

   /* LLVM on Darwin turns the following loop into a movaps plus a
      handful of scalar stores.  This would work fine except for the
      fact that VEX doesn't keep the stack correctly (16-) aligned for
      the call, so it segfaults.  Hence, split the loop into two
      pieces (and pray LLVM doesn't merely glue them back together) so
      it's composed only of scalar stores and so is alignment
      insensitive.  Of course this is a kludge of the lamest kind --
      VEX should be fixed properly. */
   /* Code that seems to trigger the problem:
      for (i = 0; i < 14; i++) tmp.env[i] = 0; */
   for (i = 0; i < 7; i++) tmp.env[i+0] = 0;
   /* Compiler barrier, intended to keep the two halves separate. */
   __asm__ __volatile__("" ::: "memory");
   for (i = 0; i < 7; i++) tmp.env[i+7] = 0;
   
   for (i = 0; i < 80; i++) tmp.reg[i] = 0;
   /* fill in tmp.reg[]: eight 10-byte registers, each taken from the
      first 10 bytes of its 16-byte slot in the image */
   for (stno = 0; stno < 8; stno++) {
      UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
      UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
      dstS[0] = srcS[0];
      dstS[1] = srcS[1];
      dstS[2] = srcS[2];
      dstS[3] = srcS[3];
      dstS[4] = srcS[4];
   }
   /* fill in tmp.env[0..13]; only the control, status and tag words
      are taken from the image, the rest stay zero */
   tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */

   /* Expand the one-bit-per-register tag summary byte into the
      two-bits-per-register x87 tag word. */
   fp_tags = 0;
   for (r = 0; r < 8; r++) {
      if (addrC[4] & (1<<r))
         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
      else 
         fp_tags |= (3 << (2*r)); /* EMPTY */
   }
   tmp.env[FP_ENV_TAG] = fp_tags;

   /* Now write 'tmp' into the guest state. */
   warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );

   /* Reassemble the 32-bit MXCSR from its two 16-bit halves.  The
      checker returns the value to store in guest_SSEROUND in the low
      32 bits and an emulation note in the high 32 bits. */
   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
     ULong w64 = x86g_check_ldmxcsr( w32 );

     warnXMM = (VexEmNote)(w64 >> 32);

     gst->guest_SSEROUND = w64 & 0xFFFFFFFF;
   }

   /* Prefer an X87 emwarn over an XMM one, if both exist. */
   if (warnX87 != EmNote_NONE)
      return warnX87;
   else
      return warnXMM;
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
/* Write the guest x87 state, as an Fpu_State image, directly to the
   memory at 'addr'. */
void x86g_dirtyhelper_FSAVE ( VexGuestX86State* gst, HWord addr )
{
   Fpu_State* image = (Fpu_State*)addr;
   do_get_x87( gst, image );
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest state, reads guest mem) */
/* Load the guest x87 state, registers included, from the Fpu_State
   image at 'addr'.  Returns whatever emulation note the restore
   produced. */
VexEmNote x86g_dirtyhelper_FRSTOR ( VexGuestX86State* gst, HWord addr )
{
   Fpu_State* image = (Fpu_State*)addr;
   VexEmNote  note  = do_put_x87( True/*regs too*/, image, gst );
   return note;
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (reads guest state, writes guest mem) */
/* Store just the 14-word x87 environment to the memory at 'addr'.
   A full state image is built in a scratch buffer first and only
   its environment part is copied out. */
void x86g_dirtyhelper_FSTENV ( VexGuestX86State* gst, HWord addr )
{
   Fpu_State scratch;
   UShort*   out = (UShort*)addr;
   Int       k   = 0;

   do_get_x87( gst, &scratch );
   while (k < 14) {
      out[k] = scratch.env[k];
      k++;
   }
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (writes guest state, reads guest mem) */
/* Load the x87 environment from the memory at 'addr', asking
   do_put_x87 not to move the register contents.  Returns whatever
   emulation note the restore produced. */
VexEmNote x86g_dirtyhelper_FLDENV ( VexGuestX86State* gst, HWord addr )
{
   Fpu_State* image = (Fpu_State*)addr;
   VexEmNote  note  = do_put_x87( False/*don't move regs*/, image, gst );
   return note;
}

/* VISIBLE TO LIBVEX CLIENT */
/* Save the x87 state held in the supplied VexGuestX86State into the
   caller's buffer, which must be at least 108 bytes long. */
void LibVEX_GuestX86_get_x87 ( /*IN*/VexGuestX86State* vex_state,
                               /*OUT*/UChar* x87_state )
{
   Fpu_State* image = (Fpu_State*)x87_state;
   do_get_x87 ( vex_state, image );
}

/* VISIBLE TO LIBVEX CLIENT */
/* Restore the x87 state, registers included, from the caller's
   buffer into the supplied VexGuestX86State.  Returns whatever
   emulation note the restore produced. */
VexEmNote LibVEX_GuestX86_put_x87 ( /*IN*/UChar* x87_state,
                                    /*MOD*/VexGuestX86State* vex_state )
{
   Fpu_State* image = (Fpu_State*)x87_state;
   VexEmNote  note  = do_put_x87 ( True/*moveRegs*/, image, vex_state );
   return note;
}

/* VISIBLE TO LIBVEX CLIENT */
/* Return an MXCSR value synthesised from the SSE rounding mode held
   in the supplied VexGuestX86State. */
UInt LibVEX_GuestX86_get_mxcsr ( /*IN*/VexGuestX86State* vex_state )
{
   UInt mxcsr = x86g_create_mxcsr ( vex_state->guest_SSEROUND );
   return mxcsr;
}

/* VISIBLE TO LIBVEX CLIENT */
/* Update the supplied VexGuestX86State from the given mxcsr value.
   The checker's 64-bit result carries the new guest_SSEROUND value
   in its low 32 bits and an emulation note in its high 32 bits; the
   note is returned to the caller. */
VexEmNote LibVEX_GuestX86_put_mxcsr ( /*IN*/UInt mxcsr,
                                      /*MOD*/VexGuestX86State* vex_state)
{
   ULong     packed = x86g_check_ldmxcsr( mxcsr );
   VexEmNote note   = (VexEmNote)(packed >> 32);

   vex_state->guest_SSEROUND = packed & 0xFFFFFFFF;
   return note;
}

/*---------------------------------------------------------------*/
/*--- Misc integer helpers, including rotates and CPUID.      ---*/
/*---------------------------------------------------------------*/

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Rotate 'arg' right through the carry flag, for an operand of 'sz'
   bytes (1, 2 or 4; anything else panics).  Returns the rotated
   value in the low 32 bits and the updated flags word in the high
   32 bits.  Only the C and O bits of 'eflags_in' are changed.
*/
ULong x86g_calculate_RCR ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
{
   UInt nRot    = rot_amt & 0x1F;
   UInt topBit  = 0;   /* index of the operand's most significant bit */
   UInt lowMask = 0;   /* all operand bits below topBit */
   UInt carry, ovf, lsb;

   /* Per-size parameters.  For 16- and 8-bit operands the count is
      reduced modulo (width + 1), since the carry takes part in the
      rotation. */
   switch (sz) {
      case 4:
         topBit  = 31;
         lowMask = 0x7FFFFFFF;
         break;
      case 2:
         topBit  = 15;
         lowMask = 0x7FFF;
         nRot   %= 17;
         break;
      case 1:
         topBit  = 7;
         lowMask = 0x7F;
         nRot   %= 9;
         break;
      default:
         /* vpanic is expected not to return. */
         vpanic("calculate_RCR: invalid size");
   }

   carry = (eflags_in >> X86G_CC_SHIFT_C) & 1;
   /* O is derived from the operand and carry as they are *before*
      the rotation. */
   ovf   = ((arg >> topBit) ^ carry) & 1;

   /* One bit per iteration: old carry enters at the top, old bit 0
      becomes the new carry. */
   for (; nRot > 0; nRot--) {
      lsb   = arg & 1;
      arg   = ((arg >> 1) & lowMask) | (carry << topBit);
      carry = lsb;
   }

   carry &= 1;
   ovf   &= 1;
   eflags_in &= ~(X86G_CC_MASK_C | X86G_CC_MASK_O);
   eflags_in |= (carry << X86G_CC_SHIFT_C) | (ovf << X86G_CC_SHIFT_O);

   return (((ULong)eflags_in) << 32) | ((ULong)arg);
}


/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Rotate 'arg' left through the carry flag, for an operand of 'sz'
   bytes (1, 2 or 4; anything else panics).  Returns the rotated
   value in the low 32 bits and the updated flags word in the high
   32 bits.  Only the C and O bits of 'eflags_in' are changed.
*/
ULong x86g_calculate_RCL ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
{
   UInt nRot     = rot_amt & 0x1F;
   UInt topBit   = 0;   /* index of the operand's most significant bit */
   UInt keepMask = 0;   /* all bits belonging to the operand */
   UInt carry, ovf, msb;

   /* Per-size parameters.  For 16- and 8-bit operands the count is
      reduced modulo (width + 1), since the carry takes part in the
      rotation. */
   switch (sz) {
      case 4:
         topBit   = 31;
         keepMask = 0xFFFFFFFF;
         break;
      case 2:
         topBit   = 15;
         keepMask = 0xFFFF;
         nRot    %= 17;
         break;
      case 1:
         topBit   = 7;
         keepMask = 0xFF;
         nRot    %= 9;
         break;
      default:
         /* vpanic is expected not to return. */
         vpanic("calculate_RCL: invalid size");
   }

   carry = (eflags_in >> X86G_CC_SHIFT_C) & 1;

   /* One bit per iteration: old carry enters at bit 0, old top bit
      becomes the new carry. */
   for (; nRot > 0; nRot--) {
      msb   = (arg >> topBit) & 1;
      arg   = keepMask & ((arg << 1) | (carry & 1));
      carry = msb;
   }

   /* O is derived from the operand and carry as they are *after*
      the rotation. */
   ovf = ((arg >> topBit) ^ carry) & 1;

   carry &= 1;
   ovf   &= 1;
   eflags_in &= ~(X86G_CC_MASK_C | X86G_CC_MASK_O);
   eflags_in |= (carry << X86G_CC_SHIFT_C) | (ovf << X86G_CC_SHIFT_O);

   return (((ULong)eflags_in) << 32) | ((ULong)arg);
}


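/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* x86g_calculate_daa_das_aaa_aas (defined after the small parity
   helper that immediately follows this comment) calculates both
   flags and value result for DAA/DAS/AAA/AAS.
   AX value in low half of arg, OSZACP in upper half.
   See guest-x86/toIR.c usage point for details.
*/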
/* Returns 1 if the low 8 bits of w32 contain an even number of set
   bits, and 0 if they contain an odd number.  Bits above bit 7 are
   ignored. */
static UInt calc_parity_8bit ( UInt w32 ) {
   /* XOR-fold the byte down so that bit 0 holds the XOR of all
      eight bits, then invert that bit. */
   UInt folded = w32 & 0xFF;
   folded ^= folded >> 4;
   folded ^= folded >> 2;
   folded ^= folded >> 1;
   return (folded & 1) ^ 1;
}
/* Compute the result of DAA (opcode 0x27), DAS (0x2F), AAA (0x37) or
   AAS (0x3F); any other opcode asserts.

   flags_and_AX: AL in bits 7:0, AH in bits 15:8, and the O S Z A C P
                 flags at (16 + their X86G_CC_SHIFT_* position).
   Returns the new AL, AH and flags packed in the same layout. */
UInt x86g_calculate_daa_das_aaa_aas ( UInt flags_and_AX, UInt opcode )
{
   /* Unpack the inputs; each flag becomes a 0/1 value. */
   UInt r_AL = (flags_and_AX >> 0) & 0xFF;
   UInt r_AH = (flags_and_AX >> 8) & 0xFF;
   UInt r_O  = (flags_and_AX >> (16 + X86G_CC_SHIFT_O)) & 1;
   UInt r_S  = (flags_and_AX >> (16 + X86G_CC_SHIFT_S)) & 1;
   UInt r_Z  = (flags_and_AX >> (16 + X86G_CC_SHIFT_Z)) & 1;
   UInt r_A  = (flags_and_AX >> (16 + X86G_CC_SHIFT_A)) & 1;
   UInt r_C  = (flags_and_AX >> (16 + X86G_CC_SHIFT_C)) & 1;
   UInt r_P  = (flags_and_AX >> (16 + X86G_CC_SHIFT_P)) & 1;
   UInt result = 0;

   switch (opcode) {
      case 0x27: { /* DAA */
         /* The second adjustment tests the *original* AL and C, so
            remember them before the first adjustment changes them. */
         UInt old_AL = r_AL;
         UInt old_C  = r_C;
         r_C = 0;
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            r_AL = r_AL + 6;
            r_C  = old_C;
            /* r_AL is not masked yet, so a carry out of 8 bits is
               visible as a value >= 0x100. */
            if (r_AL >= 0x100) r_C = 1;
            r_A = 1;
         } else {
            r_A = 0;
         }
         if (old_AL > 0x99 || old_C == 1) {
            r_AL = r_AL + 0x60;
            r_C  = 1;
         } else {
            r_C = 0;
         }
         /* O is undefined.  S Z and P are set according to the
            result. */
         r_AL &= 0xFF;
         r_O = 0; /* let's say */
         r_S = (r_AL & 0x80) ? 1 : 0;
         r_Z = (r_AL == 0) ? 1 : 0;
         r_P = calc_parity_8bit( r_AL );
         break;
      }
      case 0x2F: { /* DAS */
         /* As for DAA, the second adjustment tests the original AL
            and C. */
         UInt old_AL = r_AL;
         UInt old_C  = r_C;
         r_C = 0;
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            /* Note whether subtracting 6 will borrow, before doing
               the (wrapping) subtraction. */
            Bool borrow = r_AL < 6;
            r_AL = r_AL - 6;
            r_C  = old_C;
            if (borrow) r_C = 1;
            r_A = 1;
         } else {
            r_A = 0;
         }
         if (old_AL > 0x99 || old_C == 1) {
            r_AL = r_AL - 0x60;
            r_C  = 1;
         } else {
            /* Intel docs are wrong: r_C = 0; */
         }
         /* O is undefined.  S Z and P are set according to the
            result. */
         r_AL &= 0xFF;
         r_O = 0; /* let's say */
         r_S = (r_AL & 0x80) ? 1 : 0;
         r_Z = (r_AL == 0) ? 1 : 0;
         r_P = calc_parity_8bit( r_AL );
         break;
      }
      case 0x37: { /* AAA */
         /* If AL + 6 would carry out of 8 bits, AH gets one extra
            increment. */
         Bool nudge = r_AL > 0xF9;
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            r_AL = r_AL + 6;
            r_AH = r_AH + 1 + (nudge ? 1 : 0);
            r_A  = 1;
            r_C  = 1;
            r_AL = r_AL & 0xF;
         } else {
            r_A  = 0;
            r_C  = 0;
            r_AL = r_AL & 0xF;
         }
         /* O S Z and P are undefined. */
         r_O = r_S = r_Z = r_P = 0; /* let's say */
         break;
      }
      case 0x3F: { /* AAS */
         /* If AL - 6 would borrow, AH gets one extra decrement. */
         Bool nudge = r_AL < 0x06;
         if ((r_AL & 0xF) > 9 || r_A == 1) {
            r_AL = r_AL - 6;
            r_AH = r_AH - 1 - (nudge ? 1 : 0);
            r_A  = 1;
            r_C  = 1;
            r_AL = r_AL & 0xF;
         } else {
            r_A  = 0;
            r_C  = 0;
            r_AL = r_AL & 0xF;
         }
         /* O S Z and P are undefined. */
         r_O = r_S = r_Z = r_P = 0; /* let's say */
         break;
      }
      default:
         vassert(0);
   }
   /* Repack; AH and AL are truncated to 8 bits here. */
   result =   ( (r_O & 1) << (16 + X86G_CC_SHIFT_O) )
            | ( (r_S & 1) << (16 + X86G_CC_SHIFT_S) )
            | ( (r_Z & 1) << (16 + X86G_CC_SHIFT_Z) )
            | ( (r_A & 1) << (16 + X86G_CC_SHIFT_A) )
            | ( (r_C & 1) << (16 + X86G_CC_SHIFT_C) )
            | ( (r_P & 1) << (16 + X86G_CC_SHIFT_P) )
            | ( (r_AH & 0xFF) << 8 )
            | ( (r_AL & 0xFF) << 0 );
   return result;
}

/* Compute the result of AAM (opcode 0xD4) or AAD (opcode 0xD5), both
   with the fixed base 10; any other opcode asserts.

   flags_and_AX: AL in bits 7:0, AH in bits 15:8.  The incoming flag
                 bits do not influence the result.
   Returns the new AL and AH in the same positions, with the S, Z and
   P flags (computed from the new AL) at (16 + X86G_CC_SHIFT_*).  The
   O, A and C flags are undefined for these instructions and are
   returned as zero. */
UInt x86g_calculate_aad_aam ( UInt flags_and_AX, UInt opcode )
{
   UInt al = flags_and_AX & 0xFF;
   UInt ah = (flags_and_AX >> 8) & 0xFF;
   UInt sf, zf, pf;

   if (opcode == 0xD4) {
      /* AAM: split AL into tens (AH) and units (AL).  AH must be
         computed first, from the unmodified AL. */
      ah = al / 10;
      al = al % 10;
   } else if (opcode == 0xD5) {
      /* AAD: fold AH back into AL, keeping only 8 bits. */
      al = ((ah * 10) + al) & 0xff;
      ah = 0;
   } else {
      vassert(0);
   }

   sf = (al >> 7) & 1;
   zf = (al == 0) ? 1 : 0;
   pf = calc_parity_8bit( al );

   return   ( (sf & 1) << (16 + X86G_CC_SHIFT_S) )
          | ( (zf & 1) << (16 + X86G_CC_SHIFT_Z) )
          | ( (pf & 1) << (16 + X86G_CC_SHIFT_P) )
          | ( (ah & 0xFF) << 8 )
          | ( al & 0xFF );
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-x86 platforms, return 1. */
/* When compiled for i386 this executes the host's own rdtsc
   instruction and returns its 64-bit result. */
ULong x86g_dirtyhelper_RDTSC ( void )
{
#  if defined(__i386__)
   ULong res;
   /* "=A" binds the 64-bit result to the EDX:EAX register pair. */
   __asm__ __volatile__("rdtsc" : "=A" (res));
   return res;
#  else
   return 1ULL;
#  endif
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (modifies guest state) */
/* Claim to be a P55C (Intel Pentium/MMX) */
/* The request is read from guest_EAX and the reply is written to
   guest_EAX/EBX/ECX/EDX.  Only leaf 0 is distinguished; every other
   request gets the same fixed reply. */
void x86g_dirtyhelper_CPUID_sse0 ( VexGuestX86State* st )
{
   if (st->guest_EAX == 0) {
      /* Leaf 0: highest basic leaf is 1; EBX/EDX/ECX spell
         "GenuineIntel". */
      st->guest_EAX = 0x1;
      st->guest_EBX = 0x756e6547;
      st->guest_ECX = 0x6c65746e;
      st->guest_EDX = 0x49656e69;
   } else {
      /* Anything else. */
      st->guest_EAX = 0x543;
      st->guest_EBX = 0x0;
      st->guest_ECX = 0x0;
      st->guest_EDX = 0x8001bf;
   }
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (modifies guest state) */
/* Claim to be a Athlon "Classic" (Model 2, K75 "Pluto/Orion") */
/* But without 3DNow support (weird, but we really don't support it). */
/* The request is read from guest_EAX and the reply is written to
   guest_EAX/EBX/ECX/EDX.  Unrecognised requests get all zeroes. */
void x86g_dirtyhelper_CPUID_mmxext ( VexGuestX86State* st )
{
#  define SET_ABCD(_a,_b,_c,_d)               \
      do { st->guest_EAX = (UInt)(_a);        \
           st->guest_EBX = (UInt)(_b);        \
           st->guest_ECX = (UInt)(_c);        \
           st->guest_EDX = (UInt)(_d);        \
      } while (0)

   switch (st->guest_EAX) {
      case 0:
         /* vendor ID: EBX/EDX/ECX spell "AuthenticAMD" */
         SET_ABCD(0x1, 0x68747541, 0x444d4163, 0x69746e65);
         break;
      case 1:
         /* feature bits */
         SET_ABCD(0x621, 0x0, 0x0, 0x183f9ff);
         break;
      case 0x80000000:
         /* Highest Extended Function Supported (0x80000004 brand
            string) */
         SET_ABCD(0x80000004, 0x68747541, 0x444d4163, 0x69746e65);
         break;
      case 0x80000001:
         /* Extended Processor Info and Feature Bits.  Note no
            3DNow. */
         SET_ABCD(0x721, 0x0, 0x0, 0x1c3f9ff);
         break;
      case 0x80000002:
         /* Processor Brand String "AMD Athlon(tm) Processor",
            first 16 characters */
         SET_ABCD(0x20444d41, 0x6c687441, 0x74286e6f, 0x5020296d);
         break;
      case 0x80000003:
         /* Processor Brand String, remaining characters */
         SET_ABCD(0x65636f72, 0x726f7373, 0x0, 0x0);
         break;
      default:
         SET_ABCD(0x0, 0x0, 0x0, 0x0);
         break;
   }
#  undef SET_ABCD
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (modifies guest state) */
/* Claim to be the following SSE1-capable CPU:
   vendor_id       : GenuineIntel
   cpu family      : 6
   model           : 11
   model name      : Intel(R) Pentium(R) III CPU family      1133MHz
   stepping        : 1
   cpu MHz         : 1131.013
   cache size      : 512 KB

   The request is read from guest_EAX and the reply is written to
   guest_EAX/EBX/ECX/EDX.  Leaves 0 and 1 are distinguished; every
   other request gets the same fixed reply.
*/
void x86g_dirtyhelper_CPUID_sse1 ( VexGuestX86State* st )
{
#  define SET_ABCD(_a,_b,_c,_d)               \
      do { st->guest_EAX = (UInt)(_a);        \
           st->guest_EBX = (UInt)(_b);        \
           st->guest_ECX = (UInt)(_c);        \
           st->guest_EDX = (UInt)(_d);        \
      } while (0)

   switch (st->guest_EAX) {
      case 0:
         /* Highest basic leaf is 2; vendor string "GenuineIntel". */
         SET_ABCD(0x00000002, 0x756e6547, 0x6c65746e, 0x49656e69);
         break;
      case 1:
         SET_ABCD(0x000006b1, 0x00000004, 0x00000000, 0x0383fbff);
         break;
      default:
         SET_ABCD(0x03020101, 0x00000000, 0x00000000, 0x0c040883);
         break;
   }
#  undef SET_ABCD
}

/* Claim to be the following SSE2-capable CPU:
   vendor_id    : GenuineIntel
   cpu family   : 15
   model        : 2
   model name   : Intel(R) Pentium(R) 4 CPU 3.00GHz
   stepping     : 9
   microcode    : 0x17
   cpu MHz      : 2992.577
   cache size   : 512 KB
   flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
                  pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe
                   pebs bts cid xtpr
   clflush size : 64
   cache_alignment : 128
   address sizes : 36 bits physical, 32 bits virtual

   The request is read from guest_EAX and the reply is written to
   guest_EAX/EBX/ECX/EDX.  Leaves 0 and 1 are distinguished; every
   other request gets the same fixed reply.
*/
void x86g_dirtyhelper_CPUID_sse2 ( VexGuestX86State* st )
{
   const UInt leaf = st->guest_EAX;

   if (leaf == 0) {
      /* Highest basic leaf is 2; vendor string "GenuineIntel". */
      st->guest_EAX = 0x00000002;
      st->guest_EBX = 0x756e6547;
      st->guest_ECX = 0x6c65746e;
      st->guest_EDX = 0x49656e69;
   } else if (leaf == 1) {
      st->guest_EAX = 0x00000f29;
      st->guest_EBX = 0x01020809;
      st->guest_ECX = 0x00004400;
      st->guest_EDX = 0xbfebfbff;
   } else {
      st->guest_EAX = 0x03020101;
      st->guest_EBX = 0x00000000;
      st->guest_ECX = 0x00000000;
      st->guest_EDX = 0x0c040883;
   }
}

/* Claim to be the following SSSE3-capable CPU (2 x ...):
   vendor_id       : GenuineIntel
   cpu family      : 6
   model           : 15
   model name      : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
   stepping        : 6
   cpu MHz         : 2394.000
   cache size      : 4096 KB
   physical id     : 0
   siblings        : 2
   core id         : 0
   cpu cores       : 2
   fpu             : yes
   fpu_exception   : yes
   cpuid level     : 10
   wp              : yes
   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep
                     mtrr pge mca cmov pat pse36 clflush dts acpi
                     mmx fxsr sse sse2 ss ht tm syscall nx lm
                     constant_tsc pni monitor ds_cpl vmx est tm2
                     cx16 xtpr lahf_lm
   bogomips        : 4798.78
   clflush size    : 64
   cache_alignment : 64
   address sizes   : 36 bits physical, 48 bits virtual
   power management:

   The request is read from guest_EAX (and, for leaf 4, guest_ECX)
   and the reply is written to guest_EAX/EBX/ECX/EDX.  Any request
   not listed below gets the same reply as leaf 0xa.
*/
void x86g_dirtyhelper_CPUID_sse3 ( VexGuestX86State* st )
{
#  define SET_ABCD(_a,_b,_c,_d)               \
      do { st->guest_EAX = (UInt)(_a);        \
           st->guest_EBX = (UInt)(_b);        \
           st->guest_ECX = (UInt)(_c);        \
           st->guest_EDX = (UInt)(_d);        \
      } while (0)

   const UInt leaf    = st->guest_EAX;
   const UInt subleaf = st->guest_ECX;

   switch (leaf) {
      case 0x00000000:
         SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
         break;
      case 0x00000001:
         SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
         break;
      case 0x00000002:
         SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
         break;
      case 0x00000003:
         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x00000004:
         /* This leaf is further indexed by ECX. */
         switch (subleaf) {
            case 0x00000000:
               SET_ABCD(0x04000121, 0x01c0003f, 0x0000003f, 0x00000001);
               break;
            case 0x00000001:
               SET_ABCD(0x04000122, 0x01c0003f, 0x0000003f, 0x00000001);
               break;
            case 0x00000002:
               SET_ABCD(0x04004143, 0x03c0003f, 0x00000fff, 0x00000001);
               break;
            default:
               SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
               break;
         }
         break;
      case 0x00000005:
         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
         break;
      case 0x00000006:
         SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
         break;
      case 0x00000007:
         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x00000008:
         SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x00000009:
         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x0000000a:
      default:
         /* Leaf 0xa, and also the reply for every unhandled EAX
            value. */
         SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000000:
         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000001:
         SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100000);
         break;
      case 0x80000002:
         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
         break;
      case 0x80000003:
         SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
         break;
      case 0x80000004:
         SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
         break;
      case 0x80000005:
         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000006:
         SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
         break;
      case 0x80000007:
         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
         break;
      case 0x80000008:
         SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
         break;
   }
#  undef SET_ABCD
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-x86 platforms, return 0. */
/* When compiled for i386 this executes the host's own in
   instruction on port 'portno' (only the low 16 bits are used) with
   an access size of 'sz' bytes, and returns the value read.  A 'sz'
   other than 1, 2 or 4 performs no access and returns 0. */
UInt x86g_dirtyhelper_IN ( UInt portno, UInt sz/*1,2 or 4*/ )
{
#  if defined(__i386__)
   UInt r = 0;
   portno &= 0xFFFF;
   /* Each variant zeroes %eax first, so that the bits not written by
      a 1- or 2-byte read are zero in the result. */
   switch (sz) {
      case 4: 
         __asm__ __volatile__("movl $0,%%eax; inl %w1,%0" 
                              : "=a" (r) : "Nd" (portno));
	 break;
      case 2: 
         __asm__ __volatile__("movl $0,%%eax; inw %w1,%w0" 
                              : "=a" (r) : "Nd" (portno));
	 break;
      case 1: 
         __asm__ __volatile__("movl $0,%%eax; inb %w1,%b0" 
                              : "=a" (r) : "Nd" (portno));
	 break;
      default:
         break;
   }
   return r;
#  else
   return 0;
#  endif
}


/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-x86 platforms, do nothing. */
/* When compiled for i386 this executes the host's own out
   instruction, writing the low 'sz' bytes of 'data' to port 'portno'
   (only the low 16 bits of the port number are used).  A 'sz' other
   than 1, 2 or 4 performs no access. */
void x86g_dirtyhelper_OUT ( UInt portno, UInt data, UInt sz/*1,2 or 4*/ )
{
#  if defined(__i386__)
   portno &= 0xFFFF;
   switch (sz) {
      case 4: 
         __asm__ __volatile__("outl %0, %w1" 
                              : : "a" (data), "Nd" (portno));
	 break;
      case 2: 
         __asm__ __volatile__("outw %w0, %w1" 
                              : : "a" (data), "Nd" (portno));
	 break;
      case 1: 
         __asm__ __volatile__("outb %b0, %w1" 
                              : : "a" (data), "Nd" (portno));
	 break;
      default:
         break;
   }
#  else
   /* do nothing */
#  endif
}

/* CALLED FROM GENERATED CODE */
/* DIRTY HELPER (non-referentially-transparent) */
/* Horrible hack.  On non-x86 platforms, no instruction is executed;
   the first 6 bytes at 'address' are set to zero instead. */
/* op = 0: call the native SGDT instruction.
   op = 1: call the native SIDT instruction.
   Any other op panics (i386 builds only).
*/
void x86g_dirtyhelper_SxDT ( void *address, UInt op ) {
#  if defined(__i386__)
   switch (op) {
      case 0:
         __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
         break;
      case 1:
         __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
         break;
      default:
         vpanic("x86g_dirtyhelper_SxDT");
   }
#  else
   /* No native instruction available: zero the 6-byte result.  Note
      that 'op' is not examined on this path. */
   UChar* p = (UChar*)address;
   p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
#  endif
}

/*---------------------------------------------------------------*/
/*--- Helpers for MMX/SSE/SSE2.                               ---*/
/*---------------------------------------------------------------*/

/* Absolute difference of two unsigned bytes. */
static inline UChar abdU8 ( UChar xx, UChar yy ) {
   if (xx > yy)
      return toUChar(xx - yy);
   return toUChar(yy - xx);
}

/* Pack two 32-bit words into a 64-bit value: w1 in the high half,
   w0 in the low half. */
static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   ULong packed = (ULong)w1;
   packed <<= 32;
   packed |= (ULong)w0;
   return packed;
}

/* sel16x4_N extracts 16-bit lane N of a 64-bit value, where lane 0
   is the least significant 16 bits and lane 3 the most significant. */
static inline UShort sel16x4_3 ( ULong w64 ) {
   return toUShort(toUInt(w64 >> 32) >> 16);
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   return toUShort(toUInt(w64 >> 32));
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   return toUShort(toUInt(w64) >> 16);
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   return toUShort(toUInt(w64));
}

/* sel8x8_N extracts 8-bit lane N of a 64-bit value, where lane 0 is
   the least significant byte and lane 7 the most significant. */
static inline UChar sel8x8_7 ( ULong w64 ) {
   return toUChar(toUInt(w64 >> 32) >> 24);
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   return toUChar(toUInt(w64 >> 32) >> 16);
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   return toUChar(toUInt(w64 >> 32) >> 8);
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   return toUChar(toUInt(w64 >> 32));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   return toUChar(toUInt(w64) >> 24);
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   return toUChar(toUInt(w64) >> 16);
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   return toUChar(toUInt(w64) >> 8);
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   return toUChar(toUInt(w64));
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong x86g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
{
   return
      mk32x2( 
         (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
            + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
         (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
            + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
      );
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
ULong x86g_calculate_mmx_psadbw ( ULong xx, ULong yy )
{
   UInt t = 0;
   t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
   t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
   t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
   t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
   t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
   t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
   t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
   t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
   t &= 0xFFFF;
   return (ULong)t;
}


/*---------------------------------------------------------------*/
/*--- Helpers for dealing with segment overrides.             ---*/
/*---------------------------------------------------------------*/

/* Assemble the 32-bit segment base from the three fields it is
   split across in the descriptor: BaseLow (bits 15:0), BaseMid
   (bits 23:16) and BaseHi (bits 31:24). */
static inline 
UInt get_segdescr_base ( VexGuestX86SegDescr* ent )
{
   UInt base = 0xFFFF & (UInt)ent->LdtEnt.Bits.BaseLow;
   base |= (0xFF & (UInt)ent->LdtEnt.Bits.BaseMid) << 16;
   base |= (0xFF & (UInt)ent->LdtEnt.Bits.BaseHi)  << 24;
   return base;
}

/* Compute the segment limit held in a descriptor.  The raw value is
   20 bits wide: LimitLow gives bits 15:0 and LimitHi bits 19:16.  If
   the Granularity bit is set, the raw value is scaled up by 4096 and
   the low 12 bits are filled with ones. */
static inline
UInt get_segdescr_limit ( VexGuestX86SegDescr* ent )
{
    UInt raw20 = ((0xF & (UInt)ent->LdtEnt.Bits.LimitHi) << 16)
                 | (0xFFFF & (UInt)ent->LdtEnt.Bits.LimitLow);

    return ent->LdtEnt.Bits.Granularity ? ((raw20 << 12) | 0xFFF)
                                        : raw20;
}

/* CALLED FROM GENERATED CODE: CLEAN HELPER */
/* Translate a segment-relative address into a linear address.

   ldt, gdt      Host addresses of arrays of VexGuestX86SegDescr, or
                 zero if the corresponding table does not exist.
   seg_selector  The 16-bit segment selector, zero-extended.
   virtual_addr  The offset within the segment.

   On success the result is (virtual_addr + segment base), with the
   upper 32 bits zero.  On any failure (malformed selector, missing
   table, index out of range, limit check failed) the result is
   1ULL << 32, i.e. the upper 32 bits are nonzero. */
ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
                              UInt seg_selector, UInt virtual_addr )
{
   UInt  tiBit, index, nEntries, base, limit;
   HWord table;
   VexGuestX86SegDescr* descr;

   Bool verboze = False;

   /* If this isn't true, we're in Big Trouble. */
   vassert(8 == sizeof(VexGuestX86SegDescr));

   if (verboze) 
      vex_printf("x86g_use_seg_selector: "
                 "seg_selector = 0x%x, vaddr = 0x%x\n", 
                 seg_selector, virtual_addr);

   /* Check for wildly invalid selector.  Past this point
      seg_selector is known to fit in 16 bits. */
   if (seg_selector & ~0xFFFF)
      goto bad;

   /* Sanity check the segment selector.  Ensure that RPL=11b (least
      privilege).  This forms the bottom 2 bits of the selector. */
   if ((seg_selector & 3) != 3)
      goto bad;

   /* Extract the TI bit (0 means GDT, 1 means LDT) */
   tiBit = (seg_selector >> 2) & 1;

   /* Convert the segment selector into a table index.  A 16-bit
      selector leaves at most 13 index bits. */
   index = seg_selector >> 3;
   vassert(index < 8192);

   /* Choose the table to consult and how many entries it has. */
   if (tiBit == 0) {
      table    = gdt;
      nEntries = VEX_GUEST_X86_GDT_NENT;
   } else {
      table    = ldt;
      nEntries = VEX_GUEST_X86_LDT_NENT;
   }

   /* Do we actually have a table to look at? */
   if (table == 0)
      goto bad;

   /* Check for access to non-existent entry. */
   if (index >= nEntries)
      goto bad;

   descr = &((VexGuestX86SegDescr*)table)[index];
   base  = get_segdescr_base (descr);
   limit = get_segdescr_limit(descr);

   /* Do the limit check.  Note, this check is only approximate: the
      size of the access is not available here, and getting it could
      be significantly complex, so only the start address is tested.
      NOTE(review): on real hardware the limit is the last valid
      offset, so a byte access at virtual_addr == limit would be
      legal there but is rejected here.  The test is kept as-is
      because it also rejects every access through an all-zero
      descriptor (limit == 0); no separate "present" check is made
      in this function. */
   if (virtual_addr >= limit)
      goto bad;

   if (verboze) 
      vex_printf("x86g_use_seg_selector: "
                 "base = 0x%x, addr = 0x%x\n", 
                 base, base + virtual_addr);

   /* High 32 bits are zero, indicating success. */
   return (ULong)( ((UInt)virtual_addr) + base );

 bad:
   return 1ULL << 32;
}


/*---------------------------------------------------------------*/
/*--- Helpers for dealing with, and describing,               ---*/
/*--- guest state as a whole.                                 ---*/
/*---------------------------------------------------------------*/

/* Put the whole x86 guest state into a known initial condition.
   Everything is zeroed except: the flags thunk operation (set to
   X86G_CC_OP_COPY), the direction flag (1), the SSE rounding mode
   (Irrm_NEAREST), the emulation note (EmNote_NONE), and whatever
   x86g_dirtyhelper_FINIT sets up for the simulated FPU. */
/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestX86_initialise ( /*OUT*/VexGuestX86State* vex_state )
{
   /* Zero elements 0..3 of one XMM register. */
#  define ZERO_XMM(_reg)                         \
      do {                                       \
         (_reg)[3] = 0;                          \
         (_reg)[2] = 0;                          \
         (_reg)[1] = 0;                          \
         (_reg)[0] = 0;                          \
      } while (0)

   /* Host-side event-check fields. */
   vex_state->host_EvC_FAILADDR = 0;
   vex_state->host_EvC_COUNTER  = 0;

   /* Integer registers. */
   vex_state->guest_EAX = 0;
   vex_state->guest_ECX = 0;
   vex_state->guest_EDX = 0;
   vex_state->guest_EBX = 0;
   vex_state->guest_ESP = 0;
   vex_state->guest_EBP = 0;
   vex_state->guest_ESI = 0;
   vex_state->guest_EDI = 0;

   /* Flags thunk and the separately-held flag bits. */
   vex_state->guest_CC_OP   = X86G_CC_OP_COPY;
   vex_state->guest_CC_DEP1 = 0;
   vex_state->guest_CC_DEP2 = 0;
   vex_state->guest_CC_NDEP = 0;
   vex_state->guest_DFLAG   = 1; /* forwards */
   vex_state->guest_IDFLAG  = 0;
   vex_state->guest_ACFLAG  = 0;

   /* Program counter. */
   vex_state->guest_EIP = 0;

   /* Simulated x87 FPU: delegated to the FINIT helper. */
   x86g_dirtyhelper_FINIT( vex_state );

   /* SSE state: rounding mode, then all eight XMM registers. */
   vex_state->guest_SSEROUND = (UInt)Irrm_NEAREST;
   ZERO_XMM(vex_state->guest_XMM0);
   ZERO_XMM(vex_state->guest_XMM1);
   ZERO_XMM(vex_state->guest_XMM2);
   ZERO_XMM(vex_state->guest_XMM3);
   ZERO_XMM(vex_state->guest_XMM4);
   ZERO_XMM(vex_state->guest_XMM5);
   ZERO_XMM(vex_state->guest_XMM6);
   ZERO_XMM(vex_state->guest_XMM7);

#  undef ZERO_XMM

   /* Segment registers and descriptor table pointers. */
   vex_state->guest_CS  = 0;
   vex_state->guest_DS  = 0;
   vex_state->guest_ES  = 0;
   vex_state->guest_FS  = 0;
   vex_state->guest_GS  = 0;
   vex_state->guest_SS  = 0;
   vex_state->guest_LDT = 0;
   vex_state->guest_GDT = 0;

   /* No emulation warning pending. */
   vex_state->guest_EMNOTE = EmNote_NONE;

   /* SSE2 has a 'clflush' cache-line-invalidator which uses these. */
   vex_state->guest_CMSTART = 0;
   vex_state->guest_CMLEN   = 0;

   /* Remaining bookkeeping fields. */
   vex_state->guest_NRADDR        = 0;
   vex_state->guest_SC_CLASS      = 0;
   vex_state->guest_IP_AT_SYSCALL = 0;

   /* Padding is zeroed too, so no part of the struct is left unset. */
   vex_state->padding1 = 0;
   vex_state->padding2 = 0;
   vex_state->padding3 = 0;
}


/* Decide whether the guest state byte range minoff .. maxoff
   (inclusive) touches anything for which precise memory exceptions
   are required.  If in doubt the answer should be True, although
   that makes the generated code significantly slower.

   The registers that matter are %ESP, %EBP and %EIP, each 4 bytes
   wide; these are the minimum needed to extract correct stack
   backtraces from x86 code.

   In mode VexRegUpdSpAtMemAccess only %ESP is considered.
*/
Bool guest_x86_state_requires_precise_mem_exns (
        Int minoff, Int maxoff, VexRegisterUpdates pxControl
     )
{
   const Int sp_lo = offsetof(VexGuestX86State, guest_ESP);
   const Int sp_hi = sp_lo + 4 - 1;
   const Int fp_lo = offsetof(VexGuestX86State, guest_EBP);
   const Int fp_hi = fp_lo + 4 - 1;
   const Int ip_lo = offsetof(VexGuestX86State, guest_EIP);
   const Int ip_hi = ip_lo + 4 - 1;

   /* Two inclusive ranges overlap exactly when each one starts no
      later than the other one ends. */

   /* Any overlap with %ESP always demands precision. */
   if (minoff <= sp_hi && maxoff >= sp_lo)
      return True;

   /* No overlap with %ESP, and in this mode only the stack pointer
      is of interest. */
   if (pxControl == VexRegUpdSpAtMemAccess)
      return False;

   /* Overlap with %EBP? */
   if (minoff <= fp_hi && maxoff >= fp_lo)
      return True;

   /* Overlap with %EIP? */
   if (minoff <= ip_hi && maxoff >= ip_lo)
      return True;

   return False;
}


/* Expands to a brace-enclosed { offset, size } pair for member FLD of
   VexGuestX86State: its byte offset within the struct and its size
   in bytes.  The null-pointer cast is only an operand of sizeof and
   is never evaluated. */
#define ALWAYSDEFD(fld)                                \
    { offsetof(VexGuestX86State, fld),                 \
      sizeof(((VexGuestX86State*)0)->fld) }

/* Description of the x86 guest state layout: the total size of
   VexGuestX86State, the offset and size of the stack, frame and
   instruction pointers within it, and a table of (offset, size)
   ranges that are to be treated as always defined.  Each table
   entry is produced by ALWAYSDEFD from a field name, so offsets and
   sizes track the struct definition automatically. */
VexGuestLayout
   x86guest_layout 
      = { 
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestX86State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestX86State,guest_ESP),
          .sizeof_SP = 4,

          /* Describe the frame pointer. */
          .offset_FP = offsetof(VexGuestX86State,guest_EBP),
          .sizeof_FP = 4,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestX86State,guest_EIP),
          .sizeof_IP = 4,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'.  This count must be kept equal to the
             number of entries in .alwaysDefd below (currently
             indices 0 .. 23). */
          .n_alwaysDefd = 24,

          /* flags thunk: OP and NDEP are always defd, whereas DEP1
             and DEP2 have to be tracked.  See detailed comment in
             gdefs.h on meaning of thunk fields. */
          .alwaysDefd 
             = { /*  0 */ ALWAYSDEFD(guest_CC_OP),
                 /*  1 */ ALWAYSDEFD(guest_CC_NDEP),
                 /*  2 */ ALWAYSDEFD(guest_DFLAG),
                 /*  3 */ ALWAYSDEFD(guest_IDFLAG),
                 /*  4 */ ALWAYSDEFD(guest_ACFLAG),
                 /*  5 */ ALWAYSDEFD(guest_EIP),
                 /*  6 */ ALWAYSDEFD(guest_FTOP),
                 /*  7 */ ALWAYSDEFD(guest_FPTAG),
                 /*  8 */ ALWAYSDEFD(guest_FPROUND),
                 /*  9 */ ALWAYSDEFD(guest_FC3210),
                 /* 10 */ ALWAYSDEFD(guest_CS),
                 /* 11 */ ALWAYSDEFD(guest_DS),
                 /* 12 */ ALWAYSDEFD(guest_ES),
                 /* 13 */ ALWAYSDEFD(guest_FS),
                 /* 14 */ ALWAYSDEFD(guest_GS),
                 /* 15 */ ALWAYSDEFD(guest_SS),
                 /* 16 */ ALWAYSDEFD(guest_LDT),
                 /* 17 */ ALWAYSDEFD(guest_GDT),
                 /* 18 */ ALWAYSDEFD(guest_EMNOTE),
                 /* 19 */ ALWAYSDEFD(guest_SSEROUND),
                 /* 20 */ ALWAYSDEFD(guest_CMSTART),
                 /* 21 */ ALWAYSDEFD(guest_CMLEN),
                 /* 22 */ ALWAYSDEFD(guest_SC_CLASS),
                 /* 23 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
               }
        };


/*---------------------------------------------------------------*/
/*--- end                                 guest_x86_helpers.c ---*/
/*---------------------------------------------------------------*/
